194 files changed, 10506 insertions, 3210 deletions
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
new file mode 100644
index 000000000000..52257c10b0bf
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
new file mode 100644
index 000000000000..0f8cf5c73545
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV64
+; RUN: opt -mtriple=riscv32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV32
+; RUN: opt -mtriple=loongarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH64
+; RUN: opt -mtriple=loongarch32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH32
+; RUN: opt -mtriple=aarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -mtriple=aarch64_be -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_BE
+; RUN: opt -mtriple=aarch64_32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_32
+
+define void @f1() "instrument-function-entry-inlined"="_mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @_mcount(ptr [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AARCH64: {{.*}}
+; AARCH64_32: {{.*}}
+; AARCH64_BE: {{.*}}
+; LOONGARCH32: {{.*}}
+; LOONGARCH64: {{.*}}
+; RISCV32: {{.*}}
+; RISCV64: {{.*}}
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
index bd5f4c2b51a8..56ccfb9ed2e7 100644
--- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
@@ -129,6 +129,13 @@ define void @naked() naked {
   ret void
 }
 
+define available_externally void @always_inline() {
+; CHECK-LABEL: define available_externally void @always_inline() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
 ; The attributes are "consumed" when the instrumentation is inserted.
 ; CHECK: attributes
 ; CHECK-NOT: instrument-function
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 92439691e187..179b5b0a3dbf 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -45,11 +45,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i24 [[TMP2]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -63,11 +59,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -83,11 +75,7 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -103,11 +91,7 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 6)
@@ -155,11 +139,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP3]], i64 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index d71ae8be19b6..0507ec9de542 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -71,11 +71,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X32-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X32-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index f686e2997556..86dc3e5245f2 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -73,11 +73,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -189,11 +185,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 50c4dc4665c9..1840f045b3ff 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -3,13 +3,9 @@
 ; RUN: echo '!19 = !{!"%/t/exit-block.ll", !0}' > %t/1
 ; RUN: cat %s %t/1 > %t/2
 
-; By default, the exit block is the second.
+; The exit block is the second.
 ; RUN: opt -passes=insert-gcov-profiling -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s
-
-; But we can optionally emit it last, to match GCC<4.8 (r189778).
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-LAST %s
+; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -66,10 +62,7 @@ attributes #2 = { nounwind }
 
 ; There should be no destination edges for the exit block.
 ; CHECK: Block : 1 Counter : 0
-; EXIT-LAST:       Destination Edges
-; EXIT-SECOND-NOT: Destination Edges
 ; CHECK: Block : 2 Counter : 0
 ; CHECK: Block : 4 Counter : 0
-; EXIT-LAST-NOT: Destination Edges
-; EXIT-SECOND:   Destination Edges
+; CHECK:         Destination Edges
 ; CHECK-NOT: Block :
diff --git a/llvm/test/Transforms/GCOVProfiling/version.ll b/llvm/test/Transforms/GCOVProfiling/version.ll
index bfac2557da0b..4751bc1bd6dc 100644
--- a/llvm/test/Transforms/GCOVProfiling/version.ll
+++ b/llvm/test/Transforms/GCOVProfiling/version.ll
@@ -5,16 +5,16 @@
 ; RUN: cat %t/little.txt %s %t/version.txt > %t/2
 
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.804'
+; RUN: head -c8 %t/version.gcno | grep '^oncg.11B'
 ; RUN: rm %t/version.gcno
 ; RUN: not opt -passes=insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t/2
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='402*' -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.204'
+; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='B21*' -disable-output < %t/2
+; RUN: head -c8 %t/version.gcno | grep '^oncg.12B'
 ; RUN: rm %t/version.gcno
 
 ; RUN: cat %t/big.txt %s %t/version.txt > %t/big.ll
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/big.ll
-; RUN: head -c8 %t/version.gcno | grep '^gcno408.'
+; RUN: head -c8 %t/version.gcno | grep '^gcnoB11.'
 
 define void @test() !dbg !5 {
   ret void, !dbg !8
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
index 7ffbd64c700a..4cd1491611be 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
@@ -391,9 +391,33 @@ define void @nonzt0_callee() {
   ret void
 }
 
+define void @new_zt0_callee() "aarch64_new_zt0" {
+; CHECK-LABEL: define void @new_zt0_callee
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; inlineasm", ""()
+  call void @inlined_body()
+  ret void
+}
+
+define void @nonzt0_caller_new_zt0_callee_dont_inline() {
+; CHECK-LABEL: define void @nonzt0_caller_new_zt0_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_zt0_callee()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @new_zt0_callee()
+  ret void
+}
+
 define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_nonzt0_callee_dont_inline
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    call void @nonzt0_callee()
 ; CHECK-NEXT:    ret void
 ;
@@ -403,7 +427,7 @@ define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 
 define void @shared_zt0_callee() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_callee
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
@@ -415,7 +439,7 @@ define void @shared_zt0_callee() "aarch64_inout_zt0" {
 
 define void @shared_zt0_caller_shared_zt0_callee_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_shared_zt0_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
index 84462f9a7f59..d4edf12eba6a 100644
--- a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
+++ b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
@@ -12,6 +12,17 @@ define i8 @add-shl-sdiv-scalar0(i8 %x) {
   ret i8 %rz
 }
 
+define i8 @add-shl-sdiv-scalar0_commuted(i8 %x) {
+; CHECK-LABEL: @add-shl-sdiv-scalar0_commuted(
+; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 4
+; CHECK-NEXT:    ret i8 [[RZ]]
+;
+  %sd = sdiv i8 %x, -4
+  %sl = shl i8 %sd, 2
+  %rz = add i8 %x, %sl
+  ret i8 %rz
+}
+
 define i8 @add-shl-sdiv-scalar1(i8 %x) {
 ; CHECK-LABEL: @add-shl-sdiv-scalar1(
 ; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 64
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index fffe1f842669..9651858a0cae 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -1445,8 +1445,7 @@ define i1 @bitwise_and_logical_and_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_and_logical_and_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[TMP4]], i1 [[C1]], i1 false
@@ -1796,8 +1795,7 @@ define i1 @bitwise_or_logical_or_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_or_logical_or_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[TMP4]], i1 true, i1 [[C1]]
@@ -2068,12 +2066,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison1(i1 %c, i32 %x, i
 
 define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allzeros_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp eq i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], 0
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
@@ -2120,12 +2116,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allones_poison1(i1 %c, i32 %x, i3
 
 define i1 @bitwise_and_logical_and_masked_icmp_allones_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allones_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp ne i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], [[Y]]
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll
index 5a0890e918ef..5a58995f6c31 100644
--- a/llvm/test/Transforms/InstCombine/and-xor-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll
@@ -388,10 +388,9 @@ define i8 @xor_shl(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_shl
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -406,10 +405,9 @@ define i8 @and_lshr(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@and_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SY]], [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -435,6 +433,51 @@ define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
   ret i8 %r
 }
 
+define i8 @or_lshr_commuted1(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted1
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted2(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted2
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted3(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted3
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
 define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll
index 43cd6dd1211b..936c02c9f905 100644
--- a/llvm/test/Transforms/InstCombine/bit-checks.ll
+++ b/llvm/test/Transforms/InstCombine/bit-checks.ll
@@ -1335,6 +1335,22 @@ define i1 @no_masks_with_logical_or(i32 %a, i32 %b, i32 noundef %c) {
   ret i1 %or2
 }
 
+define i1 @no_masks_with_logical_or_commuted(i32 %a, i32 %b, i32 noundef %c) {
+; CHECK-LABEL: @no_masks_with_logical_or_commuted(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
+; CHECK-NEXT:    [[C:%.*]] = or i32 [[C1:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C]], 0
+; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[CMP3]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR2]]
+;
+  %cmp1 = icmp ne i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 63
+  %or1 = select i1 %cmp1, i1 true, i1 %cmp2
+  %cmp3 = icmp ne i32 %c, 0
+  %or2 = or i1 %cmp3, %or1
+  ret i1 %or2
+}
+
 define i1 @no_masks_with_logical_or2(i32 %a, i32 %b, i32 noundef %c) {
 ; CHECK-LABEL: @no_masks_with_logical_or2(
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
diff --git a/llvm/test/Transforms/InstCombine/compare-signs.ll b/llvm/test/Transforms/InstCombine/compare-signs.ll
index 9703b47b44d0..59ec9adb30b9 100644
--- a/llvm/test/Transforms/InstCombine/compare-signs.ll
+++ b/llvm/test/Transforms/InstCombine/compare-signs.ll
@@ -152,6 +152,19 @@ define i1 @test4a(i32 %a) {
   ret i1 %c
 }
 
+define i1 @test4a_commuted(i32 %a) {
+; CHECK-LABEL: @test4a_commuted(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[SIGNUM:%.*]], 1
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %l = ashr i32 %a, 31
+  %na = sub i32 0, %a
+  %r = lshr i32 %na, 31
+  %signum = or i32 %r, %l
+  %c = icmp slt i32 %signum, 1
+  ret i1 %c
+}
+
 define <2 x i1> @test4a_vec(<2 x i32> %a) {
 ; CHECK-LABEL: @test4a_vec(
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> [[A:%.*]], splat (i32 1)
diff --git a/llvm/test/Transforms/InstCombine/copysign.ll b/llvm/test/Transforms/InstCombine/copysign.ll
index abc707acf0cd..ee093a709eab 100644
--- a/llvm/test/Transforms/InstCombine/copysign.ll
+++ b/llvm/test/Transforms/InstCombine/copysign.ll
@@ -82,7 +82,7 @@ define float @not_known_positive_sign_arg(float %x, float %y) {
 
 define float @copysign_sign_arg(float %x, float %y, float %z) {
 ; CHECK-LABEL: @copysign_sign_arg(
-; CHECK-NEXT:    [[R:%.*]] = call ninf float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %s = call reassoc float @llvm.copysign.f32(float %y, float %z)
@@ -90,6 +90,26 @@ define float @copysign_sign_arg(float %x, float %y, float %z) {
   ret float %r
 }
 
+define float @copysign_sign_arg_nnan(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_nnan(
+; CHECK-NEXT:    [[R:%.*]] = call nnan float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call nnan float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
+define float @copysign_sign_arg_mixed(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_mixed(
+; CHECK-NEXT:    [[R:%.*]] = call nsz float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call ninf nsz float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan nsz float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
 define float @fneg_mag(float %x, float %y) {
 ; CHECK-LABEL: @fneg_mag(
 ; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
index 0d0af91608e7..15fad55db8df 100644
--- a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
@@ -19,7 +19,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) {
 define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -87,7 +87,7 @@ define float @test_fcmp_olt_fadd_select_constant(float %in) {
 define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -155,7 +155,7 @@ define float @test_fcmp_oge_fadd_select_constant(float %in) {
 define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -223,7 +223,7 @@ define float @test_fcmp_ole_fadd_select_constant(float %in) {
 define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -293,7 +293,7 @@ define float @test_fcmp_ugt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ugt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ole float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -366,7 +366,7 @@ define float @test_fcmp_uge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_uge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp olt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -439,7 +439,7 @@ define float @test_fcmp_ult_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ult_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp oge float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -512,7 +512,7 @@ define float @test_fcmp_ule_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ule_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ogt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 579247aaccf2..a8cdf80948a8 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -79,6 +79,19 @@ bb:
   ret i1 %i4
 }
 
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i2(i1 %a, i1 %b) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i2(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = xor i1 [[TMP1]], true
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a.ext = zext i1 %a to i2
+  %b.ext = zext i1 %b to i2
+  %add = add i2 %a.ext, %b.ext
+  %cmp = icmp eq i2 %add, 0
+  ret i1 %cmp
+}
+
 define i1 @cvt_icmp_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
 ; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq(
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index f9b90c224d83..7f8f1ae73948 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -709,3 +709,51 @@ define i1 @pointer_icmp_aligned_with_offset_negative(ptr align 8 %a, ptr align 8
   %cmp = icmp eq ptr %gep, %a2
   ret i1 %cmp
 }
+
+define i1 @gep_diff_base_same_indices(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i8, ptr %x, i64 %z
+  %gep2 = getelementptr i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nuw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw_nusw(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr nuw i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr nusw i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll
index 2e57597455c2..3b7314d36eaa 100644
--- a/llvm/test/Transforms/InstCombine/onehot_merge.ll
+++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll
@@ -1143,3 +1143,19 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_not_pwr2_logical(i32 %
   %or = select i1 %t2, i1 true, i1 %t4
   ret i1 %or
 }
+
+define i1 @two_types_of_bittest(i8 %x, i8 %c) {
+; CHECK-LABEL: @two_types_of_bittest(
+; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[C:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[T0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %t0 = shl i8 1, %c
+  %icmp1 = icmp slt i8 %x, 0
+  %and = and i8 %x, %t0
+  %icmp2 = icmp ne i8 %and, 0
+  %ret = and i1 %icmp1, %icmp2
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index bac51c82f36d..b05274658e81 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -654,6 +654,64 @@ join:
   ret ptr %gep
 }
 
+define ptr @gep_of_phi_of_gep_flags1(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags1(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
+define ptr @gep_of_phi_of_gep_flags2(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags2(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr nuw i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr nuw i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr nuw i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
 define ptr @gep_of_phi_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @gep_of_phi_of_gep_different_type(
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/overflow.ll b/llvm/test/Transforms/InstCombine/overflow.ll
index a8969a5ed02c..22e1631f78ee 100644
--- a/llvm/test/Transforms/InstCombine/overflow.ll
+++ b/llvm/test/Transforms/InstCombine/overflow.ll
@@ -11,7 +11,7 @@ define i32 @test1(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i32, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i32, i1 } [[SADD]], 0
@@ -49,7 +49,7 @@ define i32 @test2(i32 %a, i32 %b, ptr %P) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -86,7 +86,7 @@ define i64 @test3(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    ret i64 [[ADD]]
@@ -116,7 +116,7 @@ define zeroext i8 @test4(i8 signext %a, i8 signext %b) nounwind ssp {
 ; CHECK-NEXT:    [[CMP:%.*]] = extractvalue { i8, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i8, i1 } [[SADD]], 0
@@ -150,7 +150,7 @@ define i32 @test8(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -171,3 +171,91 @@ if.end:
   ret i32 %conv9
 }
 
+define i32 @uadd_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @uadd_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_overflow(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[OV]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  tail call void @llvm.assume(i1 %ov)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @uadd_no_overflow_invalid1(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid1(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    call void @use(i32 [[RES]])
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %res = extractvalue { i32, i1 } %val, 0
+  call void @use(i32 %res)
+  br i1 %cond, label %if.then, label %if.else
+if.then:
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  ret i32 %res
+if.else:
+  ret i32 0
+}
+
+define i32 @uadd_no_overflow_invalid2(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid2(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    call void @use(i32 0)
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  call void @use(i32 0) ; It is not guaranteed to transfer execution to its successors
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index e3bf5ef67381..4756b4f30e56 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -2822,3 +2822,179 @@ for.cond:                                         ; preds = %for.cond, %entry
 exit:                                             ; preds = %for.cond
   ret i64 0
 }
+
+define i1 @test_zext_icmp_eq_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[B:%.*]], true
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[CMP:%.*]] = phi i1 [ [[TMP0]], [[IF]] ], [ [[TMP1]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_ne_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_ne_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp ne i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_1(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[C:%.*]], 1
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 1
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_loop(i1 %c, i1 %b) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[Y:%.*]] = and i1 [[X]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP0]] = xor i1 [[Y]], true
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[X]]
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi i32 [ 1, %entry ], [ %ext, %loop ]
+  %x = icmp eq i32 %phi, 0
+  %y = and i1 %x, %b
+  %ext = zext i1 %y to i32
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret i1 %x
+}
+
+define i1 @test_zext_icmp_eq_0_multi_use(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_multi_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i1 [[B:%.*]] to i32
+; CHECK-NEXT:    call void @use(i32 [[B_EXT]])
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  call void @use(i32 %b.ext)
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_not_bool(i1 %a, i2 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_not_bool(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i2 [[B:%.*]] to i32
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i2 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
index e7d6cc7102c7..920497c07e38 100644
--- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
+++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
@@ -372,6 +372,32 @@ define <2 x i8> @srem_XY_XZ_with_CY_gt_CZ_no_nuw_out(<2 x i8> %X) {
   ret <2 x i8> %r
 }
 
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO1]], [[BO0]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO1, %BO0
+  ret i8 %r
+}
+
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO0]], [[BO1]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO0, %BO1
+  ret i8 %r
+}
+
 define i8 @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(i8 %X) {
 ; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(
 ; CHECK-NEXT:    [[BO0:%.*]] = mul nuw nsw i8 [[X:%.*]], 10
diff --git a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
index 35b4087d767a..2cb70e85f435 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
@@ -495,6 +495,19 @@ define i32 @test_cttz_not_bw(i32 %x) {
   ret i32 %res
 }
 
+define i32 @test_cttz_not_bw_noundef(i32 %x) {
+; CHECK-LABEL: @test_cttz_not_bw_noundef(
+; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP_NOT]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call noundef i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %res = select i1 %cmp, i32 %ct, i32 123
+  ret i32 %res
+}
+
 define i32 @test_cttz_not_bw_multiuse(i32 %x) {
 ; CHECK-LABEL: @test_cttz_not_bw_multiuse(
 ; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll
index a674f9c64b20..7dff78e3057e 100644
--- a/llvm/test/Transforms/InstCombine/select-divrem.ll
+++ b/llvm/test/Transforms/InstCombine/select-divrem.ll
@@ -322,6 +322,21 @@ define i8 @rem_euclid_non_const_pow2(i8 %0, i8 %1) {
   ret i8 %sel
 }
 
+define i8 @rem_euclid_non_const_pow2_commuted(i8 %0, i8 %1) {
+; CHECK-LABEL: @rem_euclid_non_const_pow2_commuted(
+; CHECK-NEXT:    [[NOTMASK:%.*]] = shl nsw i8 -1, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i8 [[NOTMASK]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[TMP1:%.*]], [[TMP3]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %pow2 = shl i8 1, %0
+  %rem = srem i8 %1, %pow2
+  %cond = icmp slt i8 %rem, 0
+  %add = add i8 %pow2, %rem
+  %sel = select i1 %cond, i8 %add, i8 %rem
+  ret i8 %sel
+}
+
 define i32 @rem_euclid_pow2_true_arm_folded(i32 %n) {
 ; CHECK-LABEL: @rem_euclid_pow2_true_arm_folded(
 ; CHECK-NEXT:    [[RES:%.*]] = and i32 [[N:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 0168a804239a..0f15fa6c0cc5 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3937,11 +3937,8 @@ entry:
 define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_and_xor(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -3956,11 +3953,8 @@ entry:
 define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_xor_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -4438,11 +4432,8 @@ define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4453,11 +4444,8 @@ define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4492,11 +4480,8 @@ define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp ne i32 %or, 0
@@ -4843,3 +4828,16 @@ define i32 @replace_and_cond_multiuse2(i1 %cond1, i1 %cond2) {
   %mux = select i1 %cond1, i32 %sel, i32 1
   ret i32 %mux
 }
+
+define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_simplify_2x_at_once_and(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, -1
+  %sub = sub i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %sub, i32 %xor
+  ret i32 %cond
+}
diff --git a/llvm/test/Transforms/InstCombine/xor-and-or.ll b/llvm/test/Transforms/InstCombine/xor-and-or.ll
index 47275ce31070..c380e2748f89 100644
--- a/llvm/test/Transforms/InstCombine/xor-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/xor-and-or.ll
@@ -25,6 +25,18 @@ define i1 @xor_logic_and_logic_or2(i1 %c, i1 %x, i1 %y) {
   ret i1 %r
 }
 
+define i1 @xor_logic_and_logic_or2_commuted(i1 %c, i1 %x, i1 %y) {
+; CHECK-LABEL: @xor_logic_and_logic_or2_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i1 [[TMP1]], i1 [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %o = select i1 %y, i1 true, i1 %c
+  %a = select i1 %c, i1 %x, i1 false
+  %r = xor i1 %o, %a
+  ret i1 %r
+}
+
 define i1 @xor_logic_and_logic_or3(i1 %c, i1 %x, i1 %y) {
 ; CHECK-LABEL: @xor_logic_and_logic_or3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i1 [[C:%.*]]
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
new file mode 100644
index 000000000000..543c73137c1b
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2i/f2ui and d2i/d2ui - double/float to i32 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+declare i32 @llvm.nvvm.f2i.rm(float)
+declare i32 @llvm.nvvm.f2i.rn(float)
+declare i32 @llvm.nvvm.f2i.rp(float)
+declare i32 @llvm.nvvm.f2i.rz(float)
+
+declare i32 @llvm.nvvm.f2i.rm.ftz(float)
+declare i32 @llvm.nvvm.f2i.rn.ftz(float)
+declare i32 @llvm.nvvm.f2i.rp.ftz(float)
+declare i32 @llvm.nvvm.f2i.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2i.rm(double)
+declare i32 @llvm.nvvm.d2i.rn(double)
+declare i32 @llvm.nvvm.d2i.rp(double)
+declare i32 @llvm.nvvm.d2i.rz(double)
+
+
+declare i32 @llvm.nvvm.f2ui.rm(float)
+declare i32 @llvm.nvvm.f2ui.rn(float)
+declare i32 @llvm.nvvm.f2ui.rp(float)
+declare i32 @llvm.nvvm.f2ui.rz(float)
+
+declare i32 @llvm.nvvm.f2ui.rm.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rn.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rp.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2ui.rm(double)
+declare i32 @llvm.nvvm.d2ui.rn(double)
+declare i32 @llvm.nvvm.d2ui.rp(double)
+declare i32 @llvm.nvvm.d2ui.rz(double)
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
new file mode 100644
index 000000000000..be38177dce2c
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2ll/f2ull and d2ll/d2ull - double/float to i64 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+declare i64 @llvm.nvvm.f2ll.rm(float)
+declare i64 @llvm.nvvm.f2ll.rn(float)
+declare i64 @llvm.nvvm.f2ll.rp(float)
+declare i64 @llvm.nvvm.f2ll.rz(float)
+
+declare i64 @llvm.nvvm.f2ll.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ll.rm(double)
+declare i64 @llvm.nvvm.d2ll.rn(double)
+declare i64 @llvm.nvvm.d2ll.rp(double)
+declare i64 @llvm.nvvm.d2ll.rz(double)
+
+
+declare i64 @llvm.nvvm.f2ull.rm(float)
+declare i64 @llvm.nvvm.f2ull.rn(float)
+declare i64 @llvm.nvvm.f2ull.rp(float)
+declare i64 @llvm.nvvm.f2ull.rz(float)
+
+declare i64 @llvm.nvvm.f2ull.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ull.rm(double)
+declare i64 @llvm.nvvm.d2ull.rn(double)
+declare i64 @llvm.nvvm.d2ull.rp(double)
+declare i64 @llvm.nvvm.d2ull.rz(double)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 000000000000..fe8a7e58a6a5
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructions are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 8
+  %d8 = getelementptr i8, ptr %b7, i32 12
+
+  store half 0xH0000, ptr %a6, align 16
+  store <4 x half> zeroinitializer, ptr %b7, align 2
+  store <2 x half> zeroinitializer, ptr %c8, align 2
+  store half 0xH0000, ptr %d8, align 2
+  ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 2
+  %d9 = getelementptr i8, ptr %c8, i32 2
+  %e10 = getelementptr i8, ptr %d9, i32 2
+  %f11 = getelementptr i8, ptr %e10, i32 2
+  %g12 = getelementptr i8, ptr %f11, i32 2
+  %h13 = getelementptr i8, ptr %g12, i32 2
+
+  store half 0xH0000, ptr %a6, align 16
+  store half 0xH0000, ptr %b7, align 2
+  store half 0xH0000, ptr %c8, align 2
+  store half 0xH0000, ptr %d9, align 2
+  store half 0xH0000, ptr %e10, align 8
+  store half 0xH0000, ptr %f11, align 2
+  store half 0xH0000, ptr %g12, align 2
+  store half 0xH0000, ptr %h13, align 2
+  ret void
+}
+
+define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
+; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
+; CHECK-NEXT:    br [[DOTPREHEADER11_PREHEADER:label %.*]]
+; CHECK:       [[_PREHEADER11_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
+; CHECK-NEXT:    br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
+; CHECK:       [[_LR_PH:.*:]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
+; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
+; CHECK-NEXT:    br [[DOTEXIT_POINT]]
+; CHECK:       [[_EXIT_POINT:.*:]]
+; CHECK-NEXT:    ret void
+;
+  %base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
+  br label %.preheader11.preheader
+
+.preheader11.preheader:
+  %base2 = shl nuw nsw i32 %arg0, 6
+  %base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2
+
+  %base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
+  %base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3
+
+  %cmp = icmp sgt i32 %arg0, 2
+  br i1 %cmp, label %.lr.ph, label %.exit_point
+
+.lr.ph:
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4
+
+  %dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
+  %dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
+  %dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
+  %dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14
+
+  store half %arg6_half, ptr addrspace(3) %dst, align 2
+  store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
+  store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
+  store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
+  br label %.exit_point
+
+.exit_point:
+  ret void
+}
+
+; The regression test for merging equivalence classes. It is reduced and adapted
+; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed at
+; post-commit checks with memory sanitizer on the initial attempt to implement
+; the merging of the equivalence classes.
+define void @variadics1(ptr %vlist) {
+; CHECK-LABEL: define void @variadics1(
+; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
+; CHECK-NEXT:    [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT:    [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[X5:%.*]] = fadd double [[X42]], [[X31]]
+; CHECK-NEXT:    store double [[X5]], ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+  %argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
+  %argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
+  %x0 = getelementptr i8, ptr %argp.next8, i32 7
+  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
+  %argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
+  %x2 = getelementptr i8, ptr %argp.next12, i32 7
+  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
+  %x3 = load double, ptr %argp.cur16.aligned, align 8
+  %argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
+  %x4 = load double, ptr %argp.cur16.aligned_off8, align 8
+  %x5 = fadd double %x4, %x3
+  store double %x5, ptr null, align 8
+  ret void
+}
+
+declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
index 64e8a6be998e..7a3817d3a13c 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-unknown-unknown -print-lsr-output < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=aarch64 -stop-after=loop-reduce < %s | FileCheck %s
 
 declare void @foo(i64)
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
index 8761122f756f..e6b5991d8dfb 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64-nvidia-cuda"
 ; That would be worthless, because "i" is simulated by two 32-bit registers and
 ; truncating it to 32-bit is as simple as directly using the register that
 ; contains the low bits.
-define void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
+define ptx_kernel void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
 ; CHECK-LABEL: @trunc_is_free(
 entry:
   %cmp.4 = icmp eq i64 %begin, %end
@@ -41,5 +41,3 @@ for.body:                                         ; preds = %for.body.preheader,
 
 declare void @_Z3usei(i32)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @trunc_is_free, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
index 63149adfa215..d9b890788a12 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
@@ -1,67 +1,57 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "estimated cost.*frem" --version 4
-
+; REQUIRES: asserts
 ; RUN: opt -mattr=+neon -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-NO-VECLIB
-
 ; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-NO-VECLIB
-
 ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-ARMPL
-
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL-TAILFOLD
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF-TAILFOLD
 
-; REQUIRES: asserts
-
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f64'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f64'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f64'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f64'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f64'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f64'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
@@ -84,58 +74,58 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f32'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f32'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f32'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f32'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f32'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f32'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index ddf6c1005e05..254cdf2d14d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -209,6 +209,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
@@ -218,7 +219,6 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> splat (i8 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index 4f050877bd13..e63155b024c4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -80,15 +80,13 @@ define void @powi_call(ptr %P) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -102,7 +100,7 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:    store double [[POWI]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ac7f1478cf68..5b77ced73bce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
@@ -99,9 +99,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 867355952caf..caa98d766a8c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -821,11 +821,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 8
 ; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 03de9acaf499..88b14b18c158 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -233,8 +233,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 [[START]], [[N_VEC3]]
@@ -409,8 +409,8 @@ define void @test_widen_extended_induction(ptr %dst) {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index f7a1eb455fc1..a939969af852 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index cbf9bf08c2a2..8c5d84e6981b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -91,8 +91,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,11 +117,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END6]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL11]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
@@ -151,9 +151,9 @@ exit:
   ret void
 }
 
-define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
+define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.ptr, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
 ; CHECK-LABEL: define void @test_exit_branch_cost(
-; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[X_PTR:%.*]], ptr noalias [[Y_PTR:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -172,11 +172,11 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; CHECK-NEXT:    [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
 ; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    [[BOUND016:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND117:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
@@ -184,161 +184,101 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND020:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND121:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; CHECK-NEXT:    [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT:    [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
 ; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT26]]
 ; CHECK-NEXT:    [[BOUND028:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND129:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]]
-; CHECK-NEXT:    [[CONFLICT_RDX31:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
+; CHECK-NEXT:    [[CONFLICT_RDX65:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
 ; CHECK-NEXT:    [[BOUND032:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND133:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT34:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
-; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX31]], [[FOUND_CONFLICT34]]
+; CHECK-NEXT:    [[FOUND_CONFLICT68:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
+; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX65]], [[FOUND_CONFLICT68]]
 ; CHECK-NEXT:    [[BOUND036:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND137:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT38:%.*]] = and i1 [[BOUND036]], [[BOUND137]]
 ; CHECK-NEXT:    [[CONFLICT_RDX39:%.*]] = or i1 [[CONFLICT_RDX35]], [[FOUND_CONFLICT38]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX39]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE55:.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7:![0-9]+]], !noalias [[META10:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
 ; CHECK:       [[PRED_STORE_IF42]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE43]]
 ; CHECK:       [[PRED_STORE_CONTINUE43]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
 ; CHECK:       [[PRED_STORE_IF44]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE45]]
 ; CHECK:       [[PRED_STORE_CONTINUE45]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
 ; CHECK:       [[PRED_STORE_IF46]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE47]]
 ; CHECK:       [[PRED_STORE_CONTINUE47]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
-; CHECK:       [[PRED_STORE_IF48]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
-; CHECK:       [[PRED_STORE_CONTINUE49]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
-; CHECK:       [[PRED_STORE_IF50]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
-; CHECK:       [[PRED_STORE_CONTINUE51]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
-; CHECK:       [[PRED_STORE_IF52]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
-; CHECK:       [[PRED_STORE_CONTINUE53]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55:.*]]
-; CHECK:       [[PRED_STORE_IF54]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
-; CHECK:       [[PRED_STORE_CONTINUE55]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF59:.*]], label %[[PRED_STORE_CONTINUE60:.*]]
-; CHECK:       [[PRED_STORE_IF59]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE60]]
-; CHECK:       [[PRED_STORE_CONTINUE60]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP22]], i32 1
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF61:.*]], label %[[PRED_STORE_CONTINUE62:.*]]
-; CHECK:       [[PRED_STORE_IF61]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store i64 [[TMP27]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE62]]
-; CHECK:       [[PRED_STORE_CONTINUE62]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF63:.*]], label %[[PRED_STORE_CONTINUE64:.*]]
-; CHECK:       [[PRED_STORE_IF63]]:
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
+; CHECK:       [[PRED_STORE_IF48]]:
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 0
-; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE64]]
-; CHECK:       [[PRED_STORE_CONTINUE64]]:
+; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
+; CHECK:       [[PRED_STORE_CONTINUE49]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF65:.*]], label %[[PRED_STORE_CONTINUE66:.*]]
-; CHECK:       [[PRED_STORE_IF65]]:
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
+; CHECK:       [[PRED_STORE_IF50]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP31]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE66]]
-; CHECK:       [[PRED_STORE_CONTINUE66]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP34:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP32]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
+; CHECK:       [[PRED_STORE_CONTINUE51]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[TMP33]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP36:%.*]] = or <2 x i1> [[TMP22]], [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = or <2 x i1> [[TMP23]], [[TMP35]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label %[[PRED_STORE_IF67:.*]], label %[[PRED_STORE_CONTINUE68:.*]]
-; CHECK:       [[PRED_STORE_IF67]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP45]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE68]]
-; CHECK:       [[PRED_STORE_CONTINUE68]]:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF69:.*]], label %[[PRED_STORE_CONTINUE70:.*]]
-; CHECK:       [[PRED_STORE_IF69]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP39]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE70]]
-; CHECK:       [[PRED_STORE_CONTINUE70]]:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF71:.*]], label %[[PRED_STORE_CONTINUE72:.*]]
-; CHECK:       [[PRED_STORE_IF71]]:
-; CHECK-NEXT:    [[TMP41:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP41]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE72]]
-; CHECK:       [[PRED_STORE_CONTINUE72]]:
+; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
+; CHECK:       [[PRED_STORE_IF52]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
+; CHECK-NEXT:    store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
+; CHECK:       [[PRED_STORE_CONTINUE53]]:
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF73:.*]], label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_IF73]]:
-; CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP43]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_CONTINUE74]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_IF54]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
+; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_CONTINUE55]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[TMP46]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -348,6 +288,10 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[X_GEP:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = load i64, ptr [[X_GEP]], align 8
+; CHECK-NEXT:    [[Y_GEP:%.*]] = getelementptr i32, ptr [[Y_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[Y_GEP]], align 4
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[X]], 0
 ; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_4:.*]], label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
@@ -386,6 +330,10 @@ entry:
 
 loop.header:
   %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
+  %x.gep = getelementptr i64, ptr %x.ptr, i64 %iv
+  %x = load i64, ptr %x.gep
+  %y.gep = getelementptr i32, ptr %y.ptr, i64 %iv
+  %y = load i32, ptr %y.gep
   %c1 = icmp eq i64 %x, 0
   br i1 %c1, label %then.4, label %then.1
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index d42e6af1cec0..56a468ed1310 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -819,8 +819,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index bf27f9e6be65..f9cc195e3670 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -333,10 +333,10 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index dec124b55cd4..a550f1ca14c8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -170,8 +170,8 @@ entry:
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
 ; VF_2:         Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 393ee8d30433..9dceb0167a4a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -50,12 +50,12 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]]
-; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST7]]
+; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -171,10 +171,10 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST6:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 528e202b4997..6d57f212fd88 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -105,9 +105,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
 ; CHECK-VS1-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS1-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS1-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 8
+; CHECK-VS1-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-VS1-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -127,7 +127,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS1:       [[WHILE_BODY]]:
 ; CHECK-VS1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -213,9 +213,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
 ; CHECK-VS2-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS2-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS2-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; CHECK-VS2-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VS2-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -235,7 +235,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS2:       [[WHILE_BODY]]:
 ; CHECK-VS2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -428,9 +428,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index 771dd00034c7..0ff98d2abe77 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -19,8 +19,8 @@ define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[VEC_PHI]])
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 0cea16d10367..3d4f7e0e4924 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -137,9 +137,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VSCALEFORTUNING2:       scalar.ph:
-; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    br label [[LOOP:%.*]]
 ; VSCALEFORTUNING2:       loop:
@@ -260,9 +260,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP29]], i32 [[TMP47]]
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
 ; PRED-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    br label [[LOOP1:%.*]]
 ; PRED:       loop:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a426cdf08062..a83c62b04afc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -23,9 +23,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[IDX]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 6ecaff048ca3..cb4fd04d1bc4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -512,9 +512,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A1]], i32 0
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -540,9 +540,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-UNORDERED:       for.body:
 ; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -583,9 +583,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -607,9 +607,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -649,9 +649,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
@@ -684,9 +684,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
-; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 7d058a6ef25d..295c0655a4b4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -380,9 +380,9 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 10000, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 10000, [[N_MOD_VF2]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 25d3b3fe3b83..90ef2da3d163 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -16,10 +16,10 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
@@ -82,10 +82,10 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 9b37ba588f5d..bf9562273346 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -467,9 +467,9 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -553,9 +553,9 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1248,9 +1248,9 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
@@ -1339,10 +1339,10 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP10]], splat (i64 3)
@@ -1449,9 +1449,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP8]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
@@ -1492,9 +1492,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
@@ -1548,263 +1548,5 @@ end:
   ret void
 }
 
-; Check vectorization on an interleaved load/store groups of factor 4
-
-; for (int i = 0; i < 1024; ++i) {
-;   dst[i].x = a[i].x + b[i].x;
-;   dst[i].y = a[i].y - b[i].y;
-;   dst[i].z = a[i].z << b[i].z;
-;   dst[i].t = a[i].t >> b[i].t;
-; }
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
-; CHECK-LABEL: @interleave_deinterleave(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
-; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
-; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[Y]], align 4
-; CHECK-NEXT:    [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]]
-; CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[Y14]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[Z]], align 4
-; CHECK-NEXT:    [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
-; CHECK-NEXT:    store i32 [[SHL]], ptr [[Z22]], align 4
-; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
-; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[T]], align 4
-; CHECK-NEXT:    [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[T27]], align 4
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]]
-; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
-; CHECK-NEXT:    store i32 [[SHR]], ptr [[T30]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
-  store i32 %add, ptr %arrayidx5, align 4
-  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
-  %2 = load i32, ptr %y, align 4
-  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
-  %3 = load i32, ptr %y11, align 4
-  %sub = sub nsw i32 %2, %3
-  %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
-  store i32 %sub, ptr %y14, align 4
-  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
-  %4 = load i32, ptr %z, align 4
-  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
-  %5 = load i32, ptr %z19, align 4
-  %shl = shl i32 %4, %5
-  %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
-  store i32 %shl, ptr %z22, align 4
-  %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
-  %6 = load i32, ptr %t, align 4
-  %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
-  %7 = load i32, ptr %t27, align 4
-  %shr = ashr i32 %6, %7
-  %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
-  store i32 %shr, ptr %t30, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond.not, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; Check vectorization on a reverse interleaved load/store groups of factor 4
-
-; for (int i = 1023; i >= 0; i--) {
-;   int a = A[i].x + i;
-;   int b = A[i].y - i;
-;   int c = A[i].z * i;
-;   int d = A[i].t << i;
-;   B[i].x = a;
-;   B[i].y = b;
-;   B[i].z = c;
-;   B[i].t = d;
-; }
-
-define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
-; CHECK-LABEL: @interleave_deinterleave_reverse(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i32> splat (i32 1023), [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; CHECK-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
-; CHECK-NEXT:    [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; CHECK-NEXT:    [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; CHECK-NEXT:    [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
-; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]]
-;
-entry:
-  br label %for.body
-for.cond.cleanup:                                 ; preds = %for.body
-  ret void
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
-  %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0
-  %load1 = load i32, ptr %x, align 4
-  %trunc = trunc i64 %indvars.iv to i32
-  %add = add nsw i32 %load1, %trunc
-  %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1
-  %load2 = load i32, ptr %y, align 4
-  %sub = sub nsw i32 %load2, %trunc
-  %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2
-  %load3 = load i32, ptr %z, align 4
-  %mul = mul nsw i32 %load3, %trunc
-  %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3
-  %load4 = load i32, ptr %t, align 4
-  %shl = shl nuw nsw i32 %load4, %trunc
-  %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0
-  store i32 %add, ptr %x5, align 4
-  %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1
-  store i32 %sub, ptr %y8, align 4
-  %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2
-  store i32 %mul, ptr %z5, align 4
-  %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3
-  store i32 %shl, ptr %t8, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, -1
-  %cmp = icmp sgt i64 %indvars.iv, 0
-  br i1 %cmp, label %for.body, label %for.cond.cleanup
-
-}
 attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
 attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index d4392bebdf37..1a281fe7c6f7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -529,255 +529,3 @@ for.inc:
 for.end:
   ret void
 }
-
-; Expected to contain interleave2/deinterleave2 instructions
-;
-; void masked_strided_factor4(const unsigned char* restrict p,
-;                            unsigned char* restrict q,
-;                            unsigned char guard) {
-; for(ix=0; ix < 1024; ++ix) {
-;     if (ix > guard) {
-;         char left1 = p[4*ix];
-;         char right1 = p[4*ix + 1];
-;         char left2 = p[4*ix + 2];
-;         char right2 = p[4*ix + 3];
-;         char max1 = max(left1, right1);
-;         char max2 = max(left2, right2);
-;         q[4*ix] = max1;
-;         q[4*ix + 1] = 0 - max1;
-;         q[4*ix + 2] = max2;
-;         q[4*ix + 3] = 0 - max2;
-;     }
-; }
-;}
-define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
-; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
-; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SCALAR_TAIL_FOLDING-NEXT:  entry:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SCALAR_TAIL_FOLDING:       vector.ph:
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SCALAR_TAIL_FOLDING:       vector.body:
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SCALAR_TAIL_FOLDING:       scalar.ph:
-; SCALAR_TAIL_FOLDING-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALAR_TAIL_FOLDING:       for.body:
-; SCALAR_TAIL_FOLDING-NEXT:    [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; SCALAR_TAIL_FOLDING:       if.then:
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]]
-; SCALAR_TAIL_FOLDING-NEXT:    store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1
-; SCALAR_TAIL_FOLDING-NEXT:    br label [[FOR_INC]]
-; SCALAR_TAIL_FOLDING:       for.inc:
-; SCALAR_TAIL_FOLDING-NEXT:    [[INC]] = add nuw nsw i32 [[IX_024]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; SCALAR_TAIL_FOLDING:       for.end:
-; SCALAR_TAIL_FOLDING-NEXT:    ret void
-;
-; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
-; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; PREDICATED_TAIL_FOLDING-NEXT:  entry:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDICATED_TAIL_FOLDING:       vector.ph:
-; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING:       vector.body:
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
-; PREDICATED_TAIL_FOLDING:       middle.block:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PREDICATED_TAIL_FOLDING:       scalar.ph:
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDICATED_TAIL_FOLDING:       for.body:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
-; PREDICATED_TAIL_FOLDING:       if.then:
-; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_INC]]
-; PREDICATED_TAIL_FOLDING:       for.inc:
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; PREDICATED_TAIL_FOLDING:       for.end:
-; PREDICATED_TAIL_FOLDING-NEXT:    ret void
-;
-entry:
-  %conv = zext i8 %guard to i32
-  br label %for.body
-
-for.body:
-  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp1 = icmp ugt i32 %ix.024, %conv
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:
-  %idx0 = shl nuw nsw i32 %ix.024, 2
-  %idx1 = add i32 %idx0, 1
-  %idx2 = add i32 %idx0, 2
-  %idx3 = add i32 %idx0, 3
-
-  %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0
-  %0 = load i8, ptr %array1idx0, align 1
-  %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1
-  %1 = load i8, ptr %array1idx1, align 1
-  %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2
-  %2 = load i8, ptr %array1idx2, align 1
-  %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3
-  %3 = load i8, ptr %array1idx3, align 1
-
-  %cmp.i1 = icmp slt i8 %0, %1
-  %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0
-  %sub1 = sub i8 0, %spec.select.i1
-  %cmp.i2 = icmp slt i8 %2, %3
-  %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2
-  %sub2 = sub i8 0, %spec.select.i2
-
-  %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0
-  store i8 %spec.select.i1, ptr %array3idx0, align 1
-  %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1
-  store i8 %sub1, ptr %array3idx1, align 1
-  %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2
-  store i8 %spec.select.i2, ptr %array3idx2, align 1
-  %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3
-  store i8 %sub2, ptr %array3idx3, align 1
-
-  br label %for.inc
-
-for.inc:
-  %inc = add nuw nsw i32 %ix.024, 1
-  %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 64b69be5f525..322f96f45d19 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -19,12 +19,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index d81cfbf08ec9..1f7d0b745f92 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -11,7 +11,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-LABEL: Checking a loop in 'simple_memset'
 ; VPLANS:      VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
 ; VPLANS-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; VPLANS-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count
+; VPLANS:      vp<[[TC:%[0-9]+]]> = original trip-count
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: ir-bb<entry>:
 ; VPLANS-NEXT:  EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index e5b9812604f1..75b2df93c935 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -215,9 +215,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index de1500421a91..603bd98e8149 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -17,6 +17,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%start.1> + vp<[[VEC_TC]]> * ir<8>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%start.2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   <x1> vector loop: {
@@ -55,11 +57,12 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -150,9 +153,9 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 4b096e17a4fa..881de8dc7982 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -25,10 +25,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP26]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -138,12 +138,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -229,12 +229,12 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 8037a3a0c0f8..c119248c0be4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -45,10 +45,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -90,10 +91,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 8ac46fe7687d..0c246c6ee93e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -43,10 +43,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -88,10 +89,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -138,6 +140,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -183,10 +186,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -232,6 +236,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -277,10 +282,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 648f6e874abb..a119707bed12 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -41,10 +41,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -86,10 +87,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 976c6a9a570a..551b85b7d035 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -17,8 +17,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
@@ -58,8 +58,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
@@ -99,8 +99,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12  for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
@@ -140,23 +140,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
@@ -181,8 +181,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_4:          Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
@@ -263,23 +263,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_2'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
@@ -309,30 +309,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -361,30 +361,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -413,9 +413,9 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
@@ -427,16 +427,16 @@ entry:
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -465,30 +465,30 @@ entry:
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_3'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -517,9 +517,9 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_4:          Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -621,30 +621,30 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_3'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -677,37 +677,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_4:         Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -740,37 +740,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -803,10 +803,10 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
@@ -821,19 +821,19 @@ entry:
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -866,37 +866,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_4'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -1055,37 +1055,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_4'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 79ced9a483ef..2f1af7951dbc 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -108,7 +108,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,40 +117,40 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX29]], 0
-; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64>
-; CHECK-NEXT:    [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1)
-; CHECK-NEXT:    [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]]
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2)
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[INDEX26]], 0
+; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64>
+; CHECK-NEXT:    [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1)
+; CHECK-NEXT:    [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]]
+; CHECK-NEXT:    [[TMP58]] = or <2 x i64> [[TMP63]], [[VEC_PHI29]]
+; CHECK-NEXT:    [[INDEX_NEXT32]] = add nuw i64 [[INDEX26]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT28]] = add <2 x i64> [[VEC_IND27]], splat (i64 2)
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]])
-; CHECK-NEXT:    [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP58]])
+; CHECK-NEXT:    [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL35:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP53]] to i64
@@ -164,7 +164,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 130ef7542cfb..3d00c228baf5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -75,17 +75,17 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[SRC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x ptr> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_MASKED_GATHER]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 2 x i1> [[VEC_PHI]], [[TMP9]]
@@ -100,8 +100,8 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 11efac951082..f2318d6057ee 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -22,9 +22,9 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -116,9 +116,9 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -210,9 +210,9 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -314,9 +314,9 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -420,9 +420,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -534,9 +534,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -851,9 +851,9 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP18]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP20]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 16c23cd777b6..450405f19346 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -29,10 +29,10 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -111,9 +111,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
@@ -334,9 +334,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 2 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
@@ -419,9 +419,9 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
@@ -433,24 +433,24 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <vscale x 4 x i8> [[TMP16]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[TMP17]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index a7765f47180d..038e726adc24 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -432,6 +432,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -439,7 +440,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -477,6 +477,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -486,10 +487,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
@@ -560,6 +559,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -567,7 +567,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -605,6 +604,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -614,10 +614,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index dd2e75f1f5e2..58d6fd05241f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -28,8 +28,8 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 8131c7bfd752..e4425a932738 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -63,13 +63,13 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3
 ; CHECK-NEXT:    [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X_I64]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -92,11 +92,11 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    br label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ], [ [[X_I64]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL14]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL13]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index b1ff589fe51b..bda4839dead5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 1)
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 2)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], splat (i32 2)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; CHECK-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor2_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; FIXED-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 2)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]])
-; SCALABLE-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], splat (i32 2)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; SCALABLE-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
 ; SCALABLE-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 1)
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 2)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
+; CHECK-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor2_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; FIXED-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 2)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]])
-; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
+; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -360,42 +360,42 @@ exit:
 define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; CHECK-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; CHECK-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; CHECK-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; CHECK-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor3_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; FIXED-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; FIXED-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; FIXED-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; FIXED-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; FIXED-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) {
 ; FIXED-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; FIXED-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; FIXED-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i32(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q3]], align 4
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q3]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
@@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) {
 ; SCALABLE-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
 ; SCALABLE-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
 ; SCALABLE-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -550,42 +550,42 @@ exit:
 define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; CHECK-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor3_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; FIXED-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; FIXED-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; FIXED-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; FIXED-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) {
 ; FIXED-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; FIXED-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; FIXED-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i64(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 4
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET3:%.*]] = mul i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q3]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q3]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) {
 ; SCALABLE-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; SCALABLE-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
 ; SCALABLE-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -740,75 +740,56 @@ exit:
 define void @load_store_factor8(ptr %p) {
 ; CHECK-LABEL: @load_store_factor8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[Q0]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP7]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP11]])
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 1)
-; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 2)
-; CHECK-NEXT:    [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 3)
-; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 4)
-; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 5)
-; CHECK-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 6)
-; CHECK-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 7)
-; CHECK-NEXT:    [[TMP27:%.*]] = add <vscale x 1 x i64> [[TMP19]], splat (i64 8)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP27]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
-; CHECK-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 0, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 1, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 2, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 3, i32 11>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 4, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 5, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; CHECK-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; CHECK-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -843,23 +824,23 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; CHECK-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; CHECK-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @load_store_factor8(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 0, i32 8>
 ; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 1, i32 9>
 ; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 2, i32 10>
@@ -868,39 +849,39 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 5, i32 13>
 ; FIXED-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
-; FIXED-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; FIXED-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; FIXED-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; FIXED-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; FIXED-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; FIXED-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
-; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; FIXED-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; FIXED-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; FIXED-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; FIXED-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 2
-; FIXED-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; FIXED-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; FIXED-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; FIXED-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; FIXED-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; FIXED-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; FIXED-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; FIXED-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FIXED-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FIXED-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FIXED-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FIXED-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; FIXED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; FIXED-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -935,83 +916,64 @@ define void @load_store_factor8(ptr %p) {
 ; FIXED-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; FIXED-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; FIXED-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
 ; SCALABLE-LABEL: @load_store_factor8(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
-; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP7]])
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP11]])
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; SCALABLE-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
-; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 7)
-; SCALABLE-NEXT:    [[TMP27:%.*]] = add <vscale x 1 x i64> [[TMP19]], splat (i64 8)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP27]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
-; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
-; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 0, i32 8>
+; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 1, i32 9>
+; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 2, i32 10>
+; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 3, i32 11>
+; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 4, i32 12>
+; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 5, i32 13>
+; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
+; SCALABLE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
+; SCALABLE-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
+; SCALABLE-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SCALABLE-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SCALABLE-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SCALABLE-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SCALABLE-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SCALABLE-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SCALABLE-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SCALABLE-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
-; SCALABLE-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -1046,9 +1008,9 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; SCALABLE-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; SCALABLE-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1118,7 +1080,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1126,94 +1088,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; CHECK-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 8
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
-; FIXED-NEXT:    store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4
-; FIXED-NEXT:    store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 16
-; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
+; FIXED-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
+; FIXED-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; FIXED-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1222,7 +1184,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1230,43 +1192,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]]
-; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT:    store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
 ; SCALABLE-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1301,7 +1263,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1309,94 +1271,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 4
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4
-; FIXED-NEXT:    store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8
-; FIXED-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
-; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; FIXED-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
+; FIXED-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP1:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; FIXED-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1405,7 +1367,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1413,43 +1375,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], [[TMP10]]
-; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
-; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
-; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
 ; SCALABLE-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 5c5600b9cfdf..10ac870c112a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -49,28 +49,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -78,16 +76,16 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -118,28 +116,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -147,16 +143,16 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -187,28 +183,26 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -216,16 +210,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -256,19 +250,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -285,7 +277,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -317,19 +309,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -346,7 +336,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -390,7 +380,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -407,7 +397,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 2c19aab81251..dc63072aa795 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -38,9 +38,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
@@ -117,9 +117,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb60c24393df..951d833fa941 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -64,6 +64,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -92,11 +94,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -181,11 +185,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -217,8 +220,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
@@ -311,6 +314,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -339,11 +344,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -428,11 +435,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -464,8 +470,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 375278eea38f..3386a7d3972a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -14,16 +14,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -31,13 +29,13 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -73,16 +71,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -90,13 +86,13 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 8395ffd58db9..30cb33e64ecc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -86,9 +86,9 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 64)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
@@ -162,10 +162,10 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -328,7 +328,6 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -348,7 +347,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -606,7 +605,6 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -626,7 +624,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -731,12 +729,12 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index bfdcfbf5139b..f38aa11b5af8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,20 +163,14 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -194,7 +188,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[ADD]] = add i8 [[F_039]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[F_039]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -293,7 +287,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -314,7 +308,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -354,8 +348,7 @@ attributes #1 = { "target-features"="+64bit,+v" }
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index 11cf832c8abb..f07aaecfa846 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -989,6 +989,62 @@ exit:
   ret void
 }
 
+; There's no @llvm.vp.log10, so don't transform it.
+define void @log10(ptr %a, ptr %b, i64 %N) {
+; IF-EVL-LABEL: define void @log10(
+; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT:  [[ENTRY:.*]]:
+; IF-EVL-NEXT:    br label %[[LOOP:.*]]
+; IF-EVL:       [[LOOP]]:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; IF-EVL-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; IF-EVL-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; IF-EVL:       [[EXIT]]:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: define void @log10(
+; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT:  [[ENTRY:.*]]:
+; NO-VP-NEXT:    br label %[[LOOP:.*]]
+; NO-VP:       [[LOOP]]:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; NO-VP-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP:       [[EXIT]]:
+; NO-VP-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds float, ptr %b, i64 %iv
+  %0 = load float, ptr %gep, align 4
+  %cond = tail call float @llvm.log10.f32(float %0)
+  %gep9 = getelementptr inbounds float, ptr %a, i64 %iv
+  store float %cond, ptr %gep9, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
 declare i32 @llvm.smax.i32(i32, i32)
 declare i32 @llvm.smin.i32(i32, i32)
 declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 209c251f6a6a..f323231445aa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -20,10 +20,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -37,16 +37,16 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
@@ -121,10 +121,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 7efa65ed270e..a2f85b9ed4ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -25,9 +25,9 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index e7eb5778ffb9..cd1d734f00ea 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -63,11 +63,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-OUTLOOP-NEXT: No successors
@@ -113,11 +114,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-INLOOP-NEXT: No successors
@@ -159,11 +161,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-OUTLOOP-NEXT: No successors
@@ -205,11 +208,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-INLOOP-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 98245fcc0f0a..2de0f7e4d401 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -19,125 +19,103 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 9)
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP10]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
 ; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
 ; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
 ; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
 ; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
 ; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
 ; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
 ; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
 ; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
 ; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
 ; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
 ; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
 ; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
 ; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.if31:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    store i1 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    store i1 [[ICMP_SGT]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +161,6 @@ exit:                                             ; preds = %for.body
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index d0754f1c2bb5..7b0fa644ea00 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -66,8 +66,8 @@ define void @func_21() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba072a62..61bcbaa1fe4d 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -16,30 +16,24 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 1, 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 2, 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 3, 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -64,14 +58,13 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[PRED_STORE_IF5]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -125,7 +118,6 @@ exit:
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c1b3f5..83e2f84814ad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -17,15 +17,11 @@ define void @f1() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -40,7 +36,7 @@ define void @f1() {
 ; CHECK-NEXT:    store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index f3190369ae2a..15bdbea612a7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -42,8 +42,8 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 3, [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 6a12be7da192..5c0aeb526e50 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -495,8 +495,8 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
 ; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[ANY_OF:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANY_OF_NEXT:%.*]], [[LOOP]] ]
@@ -986,8 +986,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
new file mode 100644
index 000000000000..3d23090dd123
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @fn(ptr %nno) #0 {
+; CHECK-LABEL: define i1 @fn(
+; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 10, i64 9, i64 8, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 10, [[INDEX]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10)
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison)
+; CHECK-NEXT:    [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10)
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]])
+; CHECK-NEXT:    br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY20:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ]
+; CHECK-NEXT:    [[SUM_01:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ]
+; CHECK-NEXT:    [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[REM27:%.*]] = urem i32 [[MUL]], 10
+; CHECK-NEXT:    br label [[FOR_INC35]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ]
+; CHECK-NEXT:    [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0
+; CHECK-NEXT:    ret i1 [[CMP41]]
+;
+entry:
+  br label %loop.header
+
+loop.header:                                       ; preds = %entry, %loop.latch
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop.latch ]
+  %sum.01 = phi i32 [ 0, %entry ], [ %sum.1, %loop.latch ]
+  %rem4 = and i64 %iv, 1
+  %cmp21 = icmp eq i64 %rem4, 0
+  %gep = getelementptr inbounds nuw i32, ptr %nno, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  br i1 %cmp21, label %if.then, label %loop.latch
+
+if.then:                                        ; preds = %loop.header
+  %mul = shl i32 %0, 1
+  %rem27 = urem i32 %mul, 10
+  br label %loop.latch
+
+loop.latch:                                        ; preds = %loop.header, %if.then
+  %rem27.pn = phi i32 [ %rem27, %if.then ], [ %0, %loop.header ]
+  %sum.1 = or i32 %rem27.pn, %sum.01
+  %iv.next = add nsw i64 %iv, -1
+  %cmp19.not = icmp eq i64 %iv, 0
+  br i1 %cmp19.not, label %exit, label %loop.header
+
+exit:                                        ; preds = %loop.latch
+  %sum.1.lcssa = phi i32 [ %sum.1, %loop.latch ]
+  %cmp41 = icmp eq i32 %sum.1.lcssa, 0
+  ret i1 %cmp41
+}
+
+attributes #0 = { "target-features"="+avx" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 5fb7df2c74d9..c14ddca6c913 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -60,8 +60,8 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]]
@@ -171,11 +171,11 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 48
 ; CHECK-NEXT:    store <16 x i16> [[TMP4]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP5]], ptr [[TMP10]], align 2
-; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP21]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP12]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]]
@@ -191,8 +191,8 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[L]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[L]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc i64 [[N_VEC5]] to i16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 0e511cfc9bff..6fc70802e82c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT9:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ab0b45473a62..fc6059d036cd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -61,8 +61,8 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC3:%.*]] = and i64 [[ZEXT]], 2147483644
 ; AUTO_VEC-NEXT:    [[DOTCAST5:%.*]] = uitofp nneg i64 [[N_VEC3]] to float
 ; AUTO_VEC-NEXT:    [[TMP7:%.*]] = fmul fast float [[DOTCAST5]], 5.000000e-01
@@ -441,8 +441,8 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292
 ; AUTO_VEC-NEXT:    [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float
 ; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul reassoc float [[DOTCAST8]], 4.200000e+01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index b6bccab5c2e4..8c338d6a746c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -665,16 +665,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AVX512:       vec.epilog.iter.check:
-; AVX512-NEXT:    [[TMP22:%.*]] = mul i64 [[N_VEC]], 64
-; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP22]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
 ; AVX512-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
+; AVX512-NEXT:    [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
+; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
 ; AVX512-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; AVX512-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; AVX512:       vec.epilog.ph:
-; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
 ; AVX512-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
 ; AVX512-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
@@ -691,12 +691,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD23]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
 ; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD24]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
 ; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
 ; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
 ; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 3b550449006f..68cbfad91c54 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -348,10 +348,10 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[VECTOR_MEMCHECK]] ], [ 100, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[VECTOR_MEMCHECK]] ], [ [[A]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
new file mode 100644
index 000000000000..1dd2692ba682
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_add_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i16 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[LOOP]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ADD_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %add
+}
+
+define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_sub_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[SUB:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[SUB_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index b2772648b5ee..f50177e61ef0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -56,12 +56,12 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]]
+; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -181,10 +181,10 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 597be33ca63f..9e87cc29be4e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -368,9 +368,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 63
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -414,9 +414,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; AUTOVF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 7
 ; AUTOVF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; AUTOVF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; AUTOVF-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index bb7fe4d4f1e5..1a9e7ddb965f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -14,72 +14,70 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT16]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE15:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE17:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
 ; CHECK:       [[PRED_SDIV_IF]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
 ; CHECK:       [[PRED_SDIV_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF2:.*]], label %[[PRED_SDIV_CONTINUE3:.*]]
-; CHECK:       [[PRED_SDIV_IF2]]:
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE3]]
-; CHECK:       [[PRED_SDIV_CONTINUE3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
 ; CHECK:       [[PRED_SDIV_IF4]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE5]]
 ; CHECK:       [[PRED_SDIV_CONTINUE5]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
 ; CHECK:       [[PRED_SDIV_IF6]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE7]]
 ; CHECK:       [[PRED_SDIV_CONTINUE7]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
 ; CHECK:       [[PRED_SDIV_IF8]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE9]]
 ; CHECK:       [[PRED_SDIV_CONTINUE9]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE7]] ], [ [[TMP8]], %[[PRED_SDIV_IF8]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
 ; CHECK:       [[PRED_SDIV_IF10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE11]]
 ; CHECK:       [[PRED_SDIV_CONTINUE11]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP12]], %[[PRED_SDIV_IF10]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP8]], %[[PRED_SDIV_IF10]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
 ; CHECK:       [[PRED_SDIV_IF12]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE13]]
 ; CHECK:       [[PRED_SDIV_CONTINUE13]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP16]], %[[PRED_SDIV_IF12]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP12]], %[[PRED_SDIV_IF12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15:.*]]
 ; CHECK:       [[PRED_SDIV_IF14]]:
-; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE15]]
 ; CHECK:       [[PRED_SDIV_CONTINUE15]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP20]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP16]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF16:.*]], label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_IF16]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_CONTINUE17]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE15]] ], [ [[TMP20]], %[[PRED_SDIV_IF16]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> zeroinitializer, <4 x i16> [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP24]] = or <4 x i1> [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI1]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI3]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,8 +89,8 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[B:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359af2f8..7816c4918761 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -16,18 +16,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -43,7 +41,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 41868d62a35a..d1c0201ccb9a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -85,9 +85,9 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VEC:       scalar.ph:
-; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    br label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IDX:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 8d56c3386a3b..cfae26a3a425 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -15,19 +15,18 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
 ; CHECK:       [[PRED_UREM_IF]]:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[MUL]], [[X]]
 ; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
 ; CHECK:       [[PRED_UREM_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[REM]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
 ; CHECK:       [[PRED_UREM_IF1]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = urem i64 [[MUL]], [[X]]
@@ -48,7 +47,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[PRED_UREM_CONTINUE6]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 236ed30be4f1..a0294f7ac799 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -65,16 +65,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END9:%.*]] = add i64 8, [[TMP64]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF6:%.*]] = urem i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[N_VEC7:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[N_VEC7]], 2
@@ -112,16 +112,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 8, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       iter.check27:
+; CHECK:       iter.check23:
 ; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP26]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH46:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK29:%.*]]
-; CHECK:       vector.main.loop.iter.check29:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH41:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK25:%.*]]
+; CHECK:       vector.main.loop.iter.check25:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH47:%.*]], label [[VECTOR_PH30:%.*]]
-; CHECK:       vector.ph30:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH30:%.*]]
+; CHECK:       vector.ph26:
 ; CHECK-NEXT:    [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16
 ; CHECK-NEXT:    [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[N_VEC32]], 2
@@ -129,16 +129,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY35:%.*]]
-; CHECK:       vector.body35:
-; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY35]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
+; CHECK:       vector.body29:
+; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP32]], i64 0
-; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[TMP34]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = or disjoint <16 x i64> [[VEC_IND37]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP36:%.*]] = add nsw <16 x i64> [[TMP30]], [[TMP35]]
@@ -153,44 +153,44 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY35]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block24:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block20:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK48:%.*]]
-; CHECK:       vec.epilog.iter.check49:
-; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
+; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
+; CHECK:       vec.epilog.iter.check43:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
+; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH46]], label [[VEC_EPILOG_PH47]]
-; CHECK:       vec.epilog.ph48:
-; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH41]], label [[VEC_EPILOG_PH42]]
+; CHECK:       vec.epilog.ph42:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
 ; CHECK-NEXT:    [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8
 ; CHECK-NEXT:    [[N_VEC53:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF52]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
 ; CHECK-NEXT:    [[IND_END54:%.*]] = add i64 8, [[TMP43]]
 ; CHECK-NEXT:    [[IND_END57:%.*]] = mul i64 [[N_VEC53]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT50:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT50]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    [[DOTSPLATINSERT62:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL42]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT63:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT62]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION64:%.*]] = add <8 x i64> [[DOTSPLAT63]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT72:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT72]], <8 x i1> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY58:%.*]]
-; CHECK:       vec.epilog.vector.body58:
-; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH47]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
+; CHECK:       vec.epilog.vector.body52:
+; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP46]], i64 0
-; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[TMP48]])
 ; CHECK-NEXT:    [[TMP49:%.*]] = or disjoint <8 x i64> [[VEC_IND70]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP50:%.*]] = add nsw <8 x i64> [[TMP44]], [[TMP49]]
@@ -205,17 +205,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK46:%.*]], label [[VEC_EPILOG_VECTOR_BODY58]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block46:
-; CHECK-NEXT:    [[CMP_N75:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[CMP_N75]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH46]]
-; CHECK:       vec.epilog.scalar.ph47:
-; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK48]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL59:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK48]] ]
+; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY50]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block40:
+; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
+; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]]
+; CHECK:       vec.epilog.scalar.ph41:
+; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
 ; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH46]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL59]], [[VEC_EPILOG_SCALAR_PH46]] ]
+; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ]
+; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH41]] ]
 ; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 55ff26c55b51..c9132bab80f1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -142,8 +142,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY15:%.*]]
-; CHECK:       vector.body15:
-; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE28:%.*]] ]
+; CHECK:       vector.body14:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE78:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
@@ -151,7 +151,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if21:
+; CHECK:       pred.store.if20:
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
@@ -160,10 +160,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue22:
+; CHECK:       pred.store.continue21:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
+; CHECK:       pred.store.if22:
 ; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
@@ -173,10 +173,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
+; CHECK:       pred.store.continue23:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
+; CHECK:       pred.store.if24:
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
@@ -186,10 +186,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
+; CHECK:       pred.store.continue25:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.if27:
+; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.if26:
 ; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
@@ -198,8 +198,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
 ; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.continue27:
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]]
 ; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -273,7 +273,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE18:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -290,38 +290,38 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
-; CHECK:       pred.store.if13:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 4
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[NEXT_GEP8]], align 16
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
-; CHECK:       pred.store.if15:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[OFFSET_IDX]], 8
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 8
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP9]], align 16
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[NEXT_GEP4]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
-; CHECK:       pred.store.continue16:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.if17:
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.if15:
 ; CHECK-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[OFFSET_IDX]], 12
 ; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 12
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -470,8 +470,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK:       pred.store.if10:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK:       pred.store.if9:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 4
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
@@ -480,11 +480,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[NEXT_GEP7]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
-; CHECK:       pred.store.continue11:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 8
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
@@ -493,11 +493,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext i16 [[TMP15]] to i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP8]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.if14:
+; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE15]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 12
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
@@ -507,7 +507,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 4c95584ff253..2fea016218e6 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -171,11 +171,11 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6e7efe018faf..e11f77d8aeae 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -33,7 +33,7 @@
 ; CHECK:   br i1 {{.+}}, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
 ;
 ; CHECK: vec.epilog.middle.block:
-; CHECK:   br i1 %cmp.n9, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
+; CHECK:   br i1 %cmp.n{{.+}}, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
 ;
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK:   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 04ce9562c04b..0f34f6243f15 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -11,22 +11,20 @@ define i32 @foo(ptr %p) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]]
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -63,17 +61,16 @@ exit:                              ; preds = %loop
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
-; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]])
+; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]])
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]])
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]}
-; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]])
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]}
+; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]])
+; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
+; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
+; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index 8baff7b33118..1333451a0077 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -104,6 +104,83 @@ exit:
   ret void
 }
 
+define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
+; CHECK:       [[LOOP_THEN]]:
+; CHECK-NEXT:    [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %l.b = load i32, ptr %gep.b, align 4
+  %c.1 = icmp sge i32 %l.b, 0
+  br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+  %l.a = load i32, ptr %a, align 4
+  br label %loop.latch
+
+loop.latch:
+  %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %merge, ptr %gep.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -150,7 +227,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -173,7 +250,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -251,7 +328,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -274,7 +351,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -352,7 +429,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -375,7 +452,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -453,7 +530,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -476,7 +553,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -550,7 +627,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -573,7 +650,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -653,7 +730,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -676,7 +753,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -757,7 +834,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -781,7 +858,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -854,7 +931,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -876,7 +953,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -949,7 +1026,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -971,7 +1048,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1027,7 +1104,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1049,7 +1126,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1123,7 +1200,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1145,7 +1222,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1218,7 +1295,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1240,7 +1317,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1313,7 +1390,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1335,7 +1412,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP31:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1397,4 +1474,6 @@ exit:
 ; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index c159ec868c35..94593a7d9a81 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -241,9 +241,9 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
@@ -275,8 +275,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -426,8 +426,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK:%.*]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK:%.*]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index fe6121ca3d00..32d32a64049a 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -45,12 +45,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT:  scalar.ph
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -125,13 +126,14 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
 ; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -205,12 +207,13 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
@@ -282,12 +285,13 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 7aedb218e135..fc71f8a93404 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -661,10 +661,10 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
 ; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK:      End:
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index fe16e8ce6f97..253ecaca75be 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -41,8 +41,8 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -121,8 +121,8 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -202,8 +202,8 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -387,9 +387,9 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[BB74:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[BB13:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[TMP60:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -463,9 +463,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -554,9 +554,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -852,8 +852,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -935,8 +935,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -1057,9 +1057,9 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -1125,9 +1125,9 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index a1278907a729..0b2e7fe48439 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -82,11 +82,12 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -172,11 +173,12 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -245,12 +247,13 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -361,11 +364,12 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -458,11 +462,12 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %C = icmp sgt i32 %iv.next, %recur.next
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -503,6 +508,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -546,11 +552,12 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[END]]>, ir<2>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp ugt i64 %iv, 3
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 2d50f8219549..509b8f99e4b6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -55,8 +55,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -111,8 +111,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -165,8 +165,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[SCALAR_BODY]] ]
@@ -265,8 +265,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup.loopexit:
@@ -334,8 +334,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup.loopexit:
@@ -398,8 +398,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       for.cond.cleanup.loopexit:
@@ -525,8 +525,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[SCALAR_BODY]] ]
@@ -598,8 +598,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP21:%.*]], [[SCALAR_BODY]] ]
@@ -669,8 +669,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[SCALAR_BODY]] ]
@@ -912,8 +912,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -956,8 +956,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1005,8 +1005,8 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1780,8 +1780,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1836,8 +1836,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1888,8 +1888,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
@@ -2016,8 +2016,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP48:%.*]], [[FOR_BODY]] ]
@@ -2076,8 +2076,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -2143,8 +2143,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ]
@@ -2240,8 +2240,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2299,8 +2299,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2353,8 +2353,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
@@ -2685,8 +2685,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2816,8 +2816,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
@@ -2881,8 +2881,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2990,8 +2990,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3189,8 +3189,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
@@ -3273,8 +3273,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3707,13 +3707,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
@@ -3726,7 +3721,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-IC-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       exit:
 ; UNROLL-NO-IC-NEXT:    ret i32 0
 ;
@@ -3736,11 +3731,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3753,7 +3745,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;
@@ -3763,13 +3755,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
@@ -3782,7 +3769,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; SINK-AFTER-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; SINK-AFTER-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; SINK-AFTER:       exit:
 ; SINK-AFTER-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index c4509e4ad56e..7db53d8ffced 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -172,6 +172,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -184,7 +185,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
-; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
 ; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP8]], align 4
@@ -192,10 +193,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP12]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -244,6 +243,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -256,7 +256,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP8]], ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
@@ -268,10 +268,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP11]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -321,9 +319,10 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; VEC-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
-; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0
 ; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
@@ -351,7 +350,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1)
-; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index ecb00d472448..f553864b5fc3 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -1,21 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
 
 ; int int_inc;
 ;
-;int induction_with_global(int init, int *restrict A, int N) {
+;void induction_with_global(int init, int *restrict A, int N) {
 ;  int x = init;
 ;  for (int i=0;i<N;i++){
 ;    A[i] = x;
 ;    x += int_inc;
 ;  }
-;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_global(
-; CHECK:       for.body.lr.ph:
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+@int_inc = common global i32 0, align 4
+
+define void @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-LABEL: define void @induction_with_global(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @int_inc, align 4
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %init, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INIT]], [[TMP3]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INIT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
@@ -24,53 +42,56 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP10]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-
-@int_inc = common global i32 0, align 4
-
-define i32 @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
-  %cmp4 = icmp sgt i32 %N, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
   %0 = load i32, ptr @int_inc, align 4
-  %1 = mul i32 %0, %N
   br label %for.body
 
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.05, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %x.05
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %2 = add i32 %1, %init
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
-  ret i32 %x.0.lcssa
-}
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %iv.next.trunc, %N
+  br i1 %exitcond, label %exit, label %for.body
 
+exit:
+  ret void
+}
 
 ;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
 ;  int x = init;
@@ -83,82 +104,123 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %x.011, i64 0
+define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define i32 @induction_with_loop_inv(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV15:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT16:%.*]], %[[OUTER_LATCH:.*]] ]
+; CHECK-NEXT:    [[J_012:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC5:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[X_011:%.*]] = phi i32 [ [[INIT]], %[[ENTRY]] ], [ [[X_0_LCSSA:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[DOTCAST]], [[J_012]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[X_011]], [[TMP1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X_011]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 %j.012, i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[J_012]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP8]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[INNER_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[X_011]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[INNER:.*]]
+; CHECK:       [[INNER]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[X_18:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_18]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[X_18]], [[J_012]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[INNER_EXIT]], label %[[INNER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[INNER_EXIT]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[X_011]], [[INDVARS_IV15]]
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[X_0_LCSSA]] = phi i32 [ [[TMP9]], %[[INNER_EXIT]] ]
+; CHECK-NEXT:    [[INC5]] = add nuw nsw i32 [[J_012]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT16]] = add i32 [[INDVARS_IV15]], [[N]]
+; CHECK-NEXT:    [[EXITCOND17:%.*]] = icmp eq i32 [[INC5]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND17]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
 entry:
-  %cmp10 = icmp sgt i32 %M, 0
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %cmp27 = icmp sgt i32 %N, 0
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
-  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
-  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
-  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
-  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
-
-for.body3.preheader:                              ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.preheader, %for.body3
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
-  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  %j.012 = phi i32 [ 0, %entry ], [ %inc5, %outer.latch ]
+  %x.011 = phi i32 [ %init, %entry ], [ %x.1.lcssa, %outer.latch ]
+  br label %inner
+
+inner:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %x.18 = phi i32 [ %x.011, %outer.header ], [ %add, %inner ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.18, ptr %arrayidx, align 4
   %add = add nsw i32 %x.18, %j.012
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %inner.ec = icmp eq i32 %iv.next.trunc, %N
+  br i1 %inner.ec, label %inner.exit, label %inner
 
-for.inc4.loopexit:                                ; preds = %for.body3
-  %0 = add i32 %x.011, %indvars.iv15
-  br label %for.inc4
+inner.exit:
+  %add.ivs  = add i32 %x.011, %outer.iv
+  br label %outer.latch
 
-for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
-  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+outer.latch:
+  %x.1.lcssa = phi i32 [ %add.ivs, %inner.exit ]
   %inc5 = add nuw nsw i32 %j.012, 1
-  %indvars.iv.next16 = add i32 %indvars.iv15, %N
-  %exitcond17 = icmp eq i32 %inc5, %M
-  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
-
-for.end6.loopexit:                                ; preds = %for.inc4
-  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
-  br label %for.end6
+  %outer.iv.next = add i32 %outer.iv, %N
+  %outer.ec = icmp eq i32 %inc5, %M
+  br i1 %outer.ec, label %exit, label %outer.header
 
-for.end6:                                         ; preds = %for.end6.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
-  ret i32 %x.0.lcssa
+exit:
+  ret i32 %x.1.lcssa
 }
 
-
-; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
-; CHECK:       vector.ph:
-; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-LABEL: define void @non_primary_iv_loop_inv_trunc(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
@@ -166,19 +228,38 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 8
-; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[J]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -197,22 +278,43 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: @iv_no_binary_op_in_descriptor(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @iv_no_binary_op_in_descriptor(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label %middle.block, label [[VECTOR_BODY]]
-
-define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -231,3 +333,162 @@ loop.latch:
 exit:
   ret void
 }
+
+define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_add_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[O_1]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_sub_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 28c1c2afbe08..96311de673d8 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1962,6 +1962,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -1989,7 +1990,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2030,6 +2030,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
+; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2054,8 +2056,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.continue2:
 ; IND-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
-; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]]
 ; IND-NEXT:    [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2097,7 +2097,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i1> [[TMP27]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2143,8 +2144,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL:       pred.udiv.continue8:
 ; UNROLL-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
+; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]]
 ; UNROLL-NEXT:    [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NEXT:    [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2189,6 +2190,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2239,10 +2241,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP20]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
 ; UNROLL-NO-IC-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2284,7 +2284,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[TMP47:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP47]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2366,8 +2367,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
 ; INTERLEAVE:       pred.udiv.continue16:
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP21]]
+; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]]
 ; INTERLEAVE-NEXT:    [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; INTERLEAVE-NEXT:    [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2773,8 +2774,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2844,8 +2845,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2910,8 +2911,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2981,8 +2982,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3051,8 +3052,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3122,8 +3123,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -5561,9 +5562,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       exit:
@@ -5781,9 +5782,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       exit:
@@ -6236,9 +6237,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6307,9 +6308,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
-; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[LOOP:%.*]]
 ; IND:       loop:
 ; IND-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6382,9 +6383,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
-; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL:       loop:
 ; UNROLL-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6463,9 +6464,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6538,9 +6539,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[LOOP:%.*]]
 ; INTERLEAVE:       loop:
 ; INTERLEAVE-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bb17580ac4d1..5bc832fbd684 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -14,6 +14,7 @@
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<1>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -76,6 +77,7 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<false> + vp<[[VEC_TC]]> * ir<true>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -116,11 +118,13 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; DBG-NEXT:  EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<false>
 ; DBG-NEXT: Successor(s): ir-bb<loop.header>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop.header>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; DBG-NEXT:   IR   %d.next = xor i1 %d, true
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
@@ -222,11 +226,12 @@ exit:
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; DBG-NEXT:  EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; DBG-NEXT: Successor(s): ir-bb<loop>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; DBG:        IR   %ec = icmp slt i32 %iv.next.trunc, %n
 ; DBG-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 0e7a68cd47ad..abd91d33157e 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -1508,9 +1508,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index bc1c1bf04a37..e8ad6a38d742 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -134,12 +134,12 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 88be9fa1c868..aef25a05ea12 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -401,8 +401,8 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ], [ [[ARRAYIDX5_PROMOTED]], [[FOR_BODY3_LR_PH]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[FOR_BODY3]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
index 07ee5892dc28..681ffe946d17 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
@@ -40,8 +40,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC1:       [[INNER_LOOP]]:
 ; CHECK-VF4IC1-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -114,8 +114,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC4:       [[INNER_LOOP]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -189,8 +189,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF1IC4:       [[INNER_LOOP]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 10b6d1f7653d..482f731afd81 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --replace-value-regex "!llvm.loop ![0-9]+" --version 5
 ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck --check-prefixes=CHECK,VEC %s
-; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK,INTERLEAVE %s
 
 define i32 @postinc(i32 %k)  {
 ; CHECK-LABEL: define i32 @postinc(
@@ -430,6 +430,39 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
 ; VEC-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i64 [[IV_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i64 [[IV_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -485,6 +518,42 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
 ; VEC-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[IV_2_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[IV_2_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -502,3 +571,606 @@ exit:
   %iv.2.lcssa = phi i32 [ %iv.2, %loop ]
   ret i32 %iv.2.lcssa
 }
+
+define i32 @postinc_sub(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_sub(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[K]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = sub nsw i32 [[INC_PHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ %k, %entry ], [ %inc, %for.body ]
+  %inc = sub nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_swapped_ops(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_swapped_ops(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 1, [[INC_PHI]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 1, %inc.phi
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_not_iv_backedge_value(i32 %k)  {
+; VEC-LABEL: define i32 @postinc_not_iv_backedge_value(
+; VEC-SAME: i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+; INTERLEAVE-LABEL: define i32 @postinc_not_iv_backedge_value(
+; INTERLEAVE-SAME: i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[FOR_BODY:.*]]
+; INTERLEAVE:       [[FOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; INTERLEAVE-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; INTERLEAVE-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; INTERLEAVE-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[FOR_END]]:
+; INTERLEAVE-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %inc.2 = add i32 %inc.phi, 2
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.2
+}
+
+define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fadd(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fadd fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fsub(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fsub(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fsub fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/121745.
+define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; VEC-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 1
+; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
+; VEC-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]]
+; VEC-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; VEC-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP1]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i32 %step.2, %iv
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
+
+define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP_2]], i64 0
+; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VEC-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; VEC-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; VEC-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 1)
+; VEC-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[TMP4]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
+; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; INTERLEAVE-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %inc = add i32 %iv, 1
+  %iv.next = add i32 %step.2, %inc
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 70199fa1e079..1bfb34165e52 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -500,8 +500,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -590,8 +590,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.ph:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index ed7762fbc2ba..fe660a826967 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -41,26 +41,26 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
-; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK-NEXT:    [[TMP12:%.*]] = add i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.if2:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.continue3:
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i32 -1
 ; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 4f47e66816c9..a129a4b1928c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -6,25 +6,25 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ <i32 35902, i32 0>, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> [[VEC_IND]]
 ; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 9), <2 x i32> [[PREDPHI]]
 ; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
index 3dafe8270dc3..a4b229d0a96b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
@@ -52,13 +52,13 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK:       vector.ph7:
 ; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]]
-; CHECK-NEXT:    br label [[VECTOR_BODY11:%.*]]
-; CHECK:       vector.body9:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY11]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY10:%.*]]
+; CHECK:       vector.body10:
+; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY11]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block4:
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]]
@@ -69,8 +69,8 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I62:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IDXPROM_I_I61]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK22:%.*]] = icmp ult i64 [[TMP3]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH21:%.*]], label [[VECTOR_MEMCHECK14:%.*]]
-; CHECK:       vector.memcheck14:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH22:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
+; CHECK:       vector.memcheck15:
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[IDXPROM_I_I61]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 4
@@ -78,22 +78,22 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND017:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP16]]
 ; CHECK-NEXT:    [[BOUND118:%.*]] = icmp ult ptr [[ARRAYIDX_I_I62]], [[SCEVGEP15]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH21]], label [[VECTOR_PH23:%.*]]
-; CHECK:       vector.ph23:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH22]], label [[VECTOR_PH24:%.*]]
+; CHECK:       vector.ph24:
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]]
-; CHECK-NEXT:    br label [[VECTOR_BODY26:%.*]]
-; CHECK:       vector.body26:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH23]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY26]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY27:%.*]]
+; CHECK:       vector.body27:
+; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block20:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK21:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       middle.block21:
 ; CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]]
-; CHECK:       scalar.ph21:
-; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK20]] ], [ 0, [[VECTOR_MEMCHECK14]] ], [ 0, [[LOOP_3_LR_PH]] ]
+; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]]
+; CHECK:       scalar.ph22:
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[VECTOR_MEMCHECK15]] ], [ 0, [[LOOP_3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.2:
 ; CHECK-NEXT:    [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ]
@@ -105,7 +105,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[EXITCOND92_NOT:%.*]] = icmp eq i64 [[IV846]], [[IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND92_NOT]], label [[LOOP_3_LR_PH]], label [[LOOP_2]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       loop.3:
-; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH21]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH22]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I_I62]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I653:%.*]] = getelementptr i32, ptr [[TMP2:%.*]], i64 [[IV93:%.*]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 50e18070a5c3..24b9441749ee 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -46,7 +46,7 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body5:
+; CHECK:       vector.body4:
 ; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY7]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
index bb8e19e3175f..7b57624e05e7 100644
--- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
@@ -91,7 +91,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) {
 ; DEBUGLOC:   = trunc i64 %index to i32, !dbg [[CASTLOC:![0-9]+]]
 ;
 ; DEBUGLOC: loop:
-; DEBUGLOC-NOT:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
+; DEBUGLOC:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
index 69e3e07ed313..3216c9233ea2 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
@@ -27,7 +27,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]]
@@ -36,12 +36,12 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[HEIGHT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -51,7 +51,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    store i16 [[ADD]], ptr [[HBUF]], align 1
 ; CHECK-NEXT:    [[INC]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 240421341626..ad2f9c608f20 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -733,8 +733,8 @@ define i32 @cond-uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -897,8 +897,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1039,8 +1039,8 @@ define i32 @uncond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1168,8 +1168,8 @@ define i32 @uncond_cond_uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 8a8439fca439..ca971f15e487 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -89,13 +89,13 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[PREDPHI]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 94fce866785d..f136b0e2e0b3 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -185,10 +185,10 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
@@ -262,11 +262,11 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = uitofp <vscale x 4 x i32> [[TMP7]] to <vscale x 4 x float>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul <vscale x 4 x float> [[TMP8]], splat (float 2.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 301526cf3070..550e52d31823 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1006,11 +1006,11 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    [[TMP2]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1048,6 +1048,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1055,13 +1056,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP8]] = or <4 x i1> [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1100,6 +1097,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1107,13 +1105,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP1]]
-; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index 5e2819299769..836115f38138 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -17,8 +17,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[EXTRA_ITER]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -39,8 +39,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 7590bb9d6868..4ba9cc661313 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -281,12 +281,12 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
index 40b007eff8ff..57bc7b833724 100644
--- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
@@ -19,8 +19,8 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 31732f027f6d..892ddccbc93b 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -10,12 +10,12 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 1a8f29e672f1..b427b43cdb13 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -47,10 +47,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<e2>:
@@ -129,10 +130,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
@@ -204,10 +206,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
new file mode 100644
index 000000000000..252061335e73
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
@@ -0,0 +1,52 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug %s 2>&1 | FileCheck %s
+
+
+; CHECK-LABEL: LV: Checking a loop in 'latch_exit_cannot_compute_btc_due_to_step'
+; CHECK: 	   LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Early exit is not the latch predecessor.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-LABEL: LV: Checking a loop in 'header_exit_cannot_compute_btc_due_to_step'
+; CHECK:       LV: Found an induction variable.
+; CHECK-NEXT:  LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-NOT: vector.body
+define void @latch_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop
+
+loop:                                   ; preds = %loop, %for.cond.us
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @header_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, %step
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop.latch, label %exit
+
+loop.latch:
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  br label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 72011ca7f484..7f5e0f3a77ef 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -133,11 +133,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177c0d10..85b44a7076d1 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,19 +1,138 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; Check if the vector loop condition can be simplified to true for a given
 ; VF/IC combination.
 define void @test_tc_less_than_16(ptr %A, i64 %N) {
-; CHECK-LABEL: define void @test_tc_less_than_16(
-; VF8UF1:       [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
-; VF8UF1-NEXT:  br i1 [[CMP]], label %middle.block, label %vector.body
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
 ;
-; VF8UF2:       br i1 true, label %middle.block, label %vector.body
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
 ;
-; VF16UF1:      br i1 true, label %middle.block, label %vector.body
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
 ;
 entry:
   %and = and i64 %N, 15
@@ -33,3 +152,1118 @@ loop:
 exit:
   ret void
 }
+
+define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, 10) %N) {
+; VF8UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF2-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP5]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP9]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP11]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP15]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP17]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP19]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP21]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP23]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP25]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP27]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP29]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP31]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP33]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP35]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF2-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF16UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF16UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i1 @cond()
+
+define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias %src, ptr %dst) {
+; VF8UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF1:       [[OUTER_HEADER]]:
+; VF8UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF1-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF1-NEXT:    br label %[[INNER:.*]]
+; VF8UF1:       [[INNER]]:
+; VF8UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1:       [[OUTER_LATCH]]:
+; VF8UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF2-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF2:       [[OUTER_HEADER]]:
+; VF8UF2-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF8UF2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF2-NEXT:    br label %[[INNER:.*]]
+; VF8UF2:       [[INNER]]:
+; VF8UF2-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF2-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF2-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2:       [[OUTER_LATCH]]:
+; VF8UF2-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF2-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF2-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF16UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF16UF1:       [[OUTER_HEADER]]:
+; VF16UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF16UF1-NEXT:    br label %[[INNER:.*]]
+; VF16UF1:       [[INNER]]:
+; VF16UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF16UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF16UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1:       [[OUTER_LATCH]]:
+; VF16UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF16UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF16UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi ptr [ %src, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %inner
+
+inner:
+  %inner.iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %gep.src = getelementptr i8, ptr %outer.iv, i64 %inner.iv
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %inner.iv
+  store i8 %l, ptr %gep.dst, align 1
+  %iv.next = add i64 %inner.iv, 1
+  %c.1 = icmp eq i64 %iv.next, %N
+  br i1 %c.1, label %outer.latch, label %inner
+
+outer.latch:
+  %outer.iv.next = getelementptr i8, ptr %outer.iv, i64 1
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %outer.header, label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.assume(i1)
+
+; Test case for https://github.com/llvm/llvm-project/issues/121897.
+define void @scev_expand_step(i64 %x, ptr %dst) {
+; VF8UF1-LABEL: define void @scev_expand_step(
+; VF8UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF8UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF8UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF8UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF8UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF8UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF8UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF8UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP29:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF8UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF8UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF8UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF8UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP39:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF8UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF8UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @scev_expand_step(
+; VF8UF2-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF2-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF2-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF2-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF2-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = mul i64 0, [[STEP]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = add i64 0, [[TMP6]]
+; VF8UF2-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP9]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = mul i64 1, [[STEP]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
+; VF8UF2-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP14]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = mul i64 2, [[STEP]]
+; VF8UF2-NEXT:    [[TMP17:%.*]] = add i64 0, [[TMP16]]
+; VF8UF2-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = mul i64 3, [[STEP]]
+; VF8UF2-NEXT:    [[TMP22:%.*]] = add i64 0, [[TMP21]]
+; VF8UF2-NEXT:    [[TMP23:%.*]] = add i64 [[TMP22]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP24]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = mul i64 4, [[STEP]]
+; VF8UF2-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP26]]
+; VF8UF2-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP29]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = mul i64 5, [[STEP]]
+; VF8UF2-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP31]]
+; VF8UF2-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP33]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP34]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP35]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP36:%.*]] = mul i64 6, [[STEP]]
+; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 0, [[TMP36]]
+; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP39]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP41:%.*]] = mul i64 7, [[STEP]]
+; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 0, [[TMP41]]
+; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[TMP42]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP44]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP4]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP45]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP46:%.*]] = mul i64 8, [[STEP]]
+; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 0, [[TMP46]]
+; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[TMP47]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP49]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP50:%.*]] = extractelement <8 x i1> [[TMP4]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP50]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP51:%.*]] = mul i64 9, [[STEP]]
+; VF8UF2-NEXT:    [[TMP52:%.*]] = add i64 0, [[TMP51]]
+; VF8UF2-NEXT:    [[TMP53:%.*]] = add i64 [[TMP52]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP53]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP54]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP55:%.*]] = extractelement <8 x i1> [[TMP4]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP56:%.*]] = mul i64 10, [[STEP]]
+; VF8UF2-NEXT:    [[TMP57:%.*]] = add i64 0, [[TMP56]]
+; VF8UF2-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP58]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP59]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP60:%.*]] = extractelement <8 x i1> [[TMP4]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP60]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP61:%.*]] = mul i64 11, [[STEP]]
+; VF8UF2-NEXT:    [[TMP62:%.*]] = add i64 0, [[TMP61]]
+; VF8UF2-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP63]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP64]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP4]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP65]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP66:%.*]] = mul i64 12, [[STEP]]
+; VF8UF2-NEXT:    [[TMP67:%.*]] = add i64 0, [[TMP66]]
+; VF8UF2-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP68]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP69]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP70:%.*]] = extractelement <8 x i1> [[TMP4]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP71:%.*]] = mul i64 13, [[STEP]]
+; VF8UF2-NEXT:    [[TMP72:%.*]] = add i64 0, [[TMP71]]
+; VF8UF2-NEXT:    [[TMP73:%.*]] = add i64 [[TMP72]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP74:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP73]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP74]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP75:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP75]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP76:%.*]] = mul i64 14, [[STEP]]
+; VF8UF2-NEXT:    [[TMP77:%.*]] = add i64 0, [[TMP76]]
+; VF8UF2-NEXT:    [[TMP78:%.*]] = add i64 [[TMP77]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP78]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP79]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP80:%.*]] = extractelement <8 x i1> [[TMP4]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP80]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP81:%.*]] = mul i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP82:%.*]] = add i64 0, [[TMP81]]
+; VF8UF2-NEXT:    [[TMP83:%.*]] = add i64 [[TMP82]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP84]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF2-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @scev_expand_step(
+; VF16UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF16UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF16UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF16UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF16UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP3:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = extractelement <16 x i1> [[TMP3]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF16UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF16UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP3]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF16UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF16UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP3]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF16UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF16UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP3]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF16UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF16UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP3]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF16UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF16UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP3]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF16UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF16UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP3]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP3]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP3]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP45:%.*]] = mul i64 8, [[STEP]]
+; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 0, [[TMP45]]
+; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[TMP46]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP48]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[TMP3]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP50:%.*]] = mul i64 9, [[STEP]]
+; VF16UF1-NEXT:    [[TMP51:%.*]] = add i64 0, [[TMP50]]
+; VF16UF1-NEXT:    [[TMP52:%.*]] = add i64 [[TMP51]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP53]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP54:%.*]] = extractelement <16 x i1> [[TMP3]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP54]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP55:%.*]] = mul i64 10, [[STEP]]
+; VF16UF1-NEXT:    [[TMP56:%.*]] = add i64 0, [[TMP55]]
+; VF16UF1-NEXT:    [[TMP57:%.*]] = add i64 [[TMP56]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP57]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP58]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP59:%.*]] = extractelement <16 x i1> [[TMP3]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP59]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP60:%.*]] = mul i64 11, [[STEP]]
+; VF16UF1-NEXT:    [[TMP61:%.*]] = add i64 0, [[TMP60]]
+; VF16UF1-NEXT:    [[TMP62:%.*]] = add i64 [[TMP61]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP62]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP63]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[TMP3]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP65:%.*]] = mul i64 12, [[STEP]]
+; VF16UF1-NEXT:    [[TMP66:%.*]] = add i64 0, [[TMP65]]
+; VF16UF1-NEXT:    [[TMP67:%.*]] = add i64 [[TMP66]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP67]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP68]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP69:%.*]] = extractelement <16 x i1> [[TMP3]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP69]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP70:%.*]] = mul i64 13, [[STEP]]
+; VF16UF1-NEXT:    [[TMP71:%.*]] = add i64 0, [[TMP70]]
+; VF16UF1-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP72]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP73]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP3]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP74]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP75:%.*]] = mul i64 14, [[STEP]]
+; VF16UF1-NEXT:    [[TMP76:%.*]] = add i64 0, [[TMP75]]
+; VF16UF1-NEXT:    [[TMP77:%.*]] = add i64 [[TMP76]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP77]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP78]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP79:%.*]] = extractelement <16 x i1> [[TMP3]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP79]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP80:%.*]] = mul i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP81:%.*]] = add i64 0, [[TMP80]]
+; VF16UF1-NEXT:    [[TMP82:%.*]] = add i64 [[TMP81]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP83]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF16UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  %c = icmp eq i64 %x, 65536
+  call void @llvm.assume(i1 %c)
+  %fr = freeze i64 %x
+  %step = add i64 %fr, -65534
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv.next
+  store i8 0, ptr %gep.dst, align 1
+  %ec = icmp slt i64 %iv.next, 16
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+;.
+; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
+; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 930d3cd41d31..791c995d88c1 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -499,16 +499,13 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 0, [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT:    store <4 x i16> splat (i16 -1), ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -521,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -558,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 95855e84c46e..be1eb78cab60 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -35,10 +35,11 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop.header>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 484e1ea8de0d..dd3b50b3e060 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -14,7 +14,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, 2
 ; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:   IR %ind.end = getelementptr i8, ptr %start, i64 %n.vec
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -94,7 +94,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi ir<%ind.end>, ir<%start>
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index f07d1af47af0..beb305f23884 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -19,6 +19,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + vp<[[VTC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -43,11 +45,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:        IR   %cmp = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -57,7 +61,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ;
 ; CHECK: Executing best plan with VF=8, UF=2
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF
 ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -69,29 +72,23 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:  IR   %n.mod.vf = urem i64 %and, 16
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
-; CHECK-NEXT:  IR   %ind.end = sub i64 %and, %n.vec
-; CHECK-NEXT:  IR   %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
-; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
-; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, ir<[[VFxUF]]>
-; CHECK-NEXT:     EMIT branch-on-cond ir<true>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT:   vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT:   WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
+; CHECK-NEXT:   WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:   WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT:   vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT: Successor(s): ir-bb<middle.block>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<middle.block>:
@@ -103,8 +100,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%and>
-; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end1>, ir<%A>
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index cc2bd4e12744..5c09ce22cc8f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -42,10 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -112,10 +113,11 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -180,11 +182,12 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -246,11 +249,12 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ]
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
@@ -332,10 +336,11 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %cmp = icmp ult i64 %i, 5
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -381,6 +386,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * ir<4>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -412,10 +418,11 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp = icmp slt i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -494,12 +501,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
+; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -588,10 +596,11 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp1 = icmp slt i32 %lsd, 100
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -648,6 +657,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -671,10 +681,11 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<loop.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, %inc
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -738,10 +749,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ec = icmp eq i32 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -805,6 +817,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
@@ -873,10 +886,11 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -961,10 +975,11 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ifcond = fcmp oeq float %ld.value, 5.0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1038,10 +1053,11 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1152,11 +1168,12 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
index a939b1e923a9..b6391e045769 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
@@ -16,6 +16,7 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -53,11 +54,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
-; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %tmp5 = trunc i32 %tmp4 to i8
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 534345152cb5..aa05bb153966 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -262,6 +262,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<21> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
@@ -1046,6 +1047,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1086,10 +1088,11 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv.next = add nsw i32 %iv, -1
 ; CHECK-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
 ; CHECK-NEXT:   IR   %l = load i32, ptr %gep.src, align 16
@@ -1134,6 +1137,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1177,10 +1181,11 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 -1
 ; CHECK-NEXT:   IR   %l = load i8, ptr %ptr.iv.next, align 1
 ; CHECK-NEXT:   IR   %c.1 = icmp eq i8 %l, 0
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 39b90adc74ef..65d78f4199aa 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -803,6 +803,19 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) {
   ret void
 }
 
+define void @byval_param_profile_metadata(ptr align 4 byval(i32) %ptr) {
+; CHECK-LABEL: @byval_param_profile_metadata(
+; CHECK-NEXT:    store i32 1, ptr [[PTR2:%.*]], align 4
+; CHECK-NEXT:    call void @f_byval(ptr byval(i32) align 4 [[PTR2]]), !prof [[PROF3:![0-9]+]], !memprof [[META4:![0-9]+]], !callsite [[META7:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i32, align 4
+  store i32 1, ptr %ptr
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %ptr, i64 4, i1 false)
+  call void @f_byval(ptr align 4 byval(i32) %tmp), !memprof !3, !callsite !6, !prof !7
+  ret void
+}
+
 define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) {
 ; CHECK-LABEL: @memcpy_memory_none(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR7:[0-9]+]]
@@ -897,3 +910,8 @@ define void @memcpy_immut_escape_after(ptr align 4 noalias %val) {
 !0 = !{!0}
 !1 = !{!1, !0}
 !2 = !{!1}
+!3 = !{!4}
+!4 = !{!5, !"cold"}
+!5 = !{i64 123, i64 456}
+!6 = !{i64 123}
+!7 = !{!"branch_weights", i32 10}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
new file mode 100644
index 000000000000..759d5115896c
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -0,0 +1,159 @@
+;; Test recursion handling during cloning.
+;;
+;; Original code looks like:
+;;
+;; #include <stdlib.h>
+;; #include <string.h>
+;; #include <unistd.h>
+;; __attribute((noinline)) char *D() {
+;;   return new char[10];
+;; }
+;; __attribute((noinline)) char *B(int n);
+;; __attribute((noinline)) char *C(int n) {
+;;   if (!n) {
+;;     return D();
+;;   }
+;;   return B(n-1);
+;; }
+;; __attribute((noinline)) char *B(int n) {
+;;   return C(n);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = B(1);
+;;   char *y = B(1);
+;;   char *z = B(0);
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   free(x);
+;;   sleep(200);
+;;   free(y);
+;;   free(z);
+;;   return 0;
+;; }
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold" \
+; RUN:  --check-prefix=ALL
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+;; We should call the original B for the recursive context if we have
+;; disabled recursive contexts via -memprof-allow-recursive-contexts=false.
+; SKIP-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:18:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv
+; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
new file mode 100644
index 000000000000..9f3e09d7420d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -0,0 +1,806 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+%struct.cmplx_int16_t = type { i16, i16 }
+%struct.compressed_data_8bit = type { i8, [24 x i8] }
+
+define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
+; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr nocapture noundef readonly [[SRC:%.*]], ptr nocapture noundef writeonly [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
+; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
+; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
+; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
+; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
+; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
+; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
+; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
+; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
+; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
+; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
+; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
+; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %__p0.addr.i75 = alloca <8 x i16>, align 16
+  %__p1.addr.i76 = alloca i16, align 2
+  %__ret.i77 = alloca <8 x i16>, align 16
+  %.compoundliteral.i78 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <8 x i16>, align 16
+  %__p1.addr.i63 = alloca i16, align 2
+  %__ret.i64 = alloca <8 x i16>, align 16
+  %.compoundliteral.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i60 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca i16, align 2
+  %__ret.i61 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i57 = alloca <8 x i8>, align 8
+  %__ret.i58 = alloca <8 x i16>, align 16
+  %__p0.addr.i54 = alloca <8 x i8>, align 8
+  %__ret.i55 = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i8>, align 8
+  %__ret.i = alloca <8 x i16>, align 16
+  %n_prb.addr = alloca i32, align 4
+  %src.addr = alloca ptr, align 8
+  %dst.addr = alloca ptr, align 8
+  %scale.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %prb_comp_in = alloca [3 x <8 x i16>], align 16
+  %__ret = alloca <8 x i8>, align 8
+  %tmp = alloca <8 x i8>, align 8
+  %__ret3 = alloca <8 x i8>, align 8
+  %tmp8 = alloca <8 x i8>, align 8
+  %__ret11 = alloca <8 x i8>, align 8
+  %tmp16 = alloca <8 x i8>, align 8
+  %prb_decomp = alloca [3 x <8 x i16>], align 16
+  %scaling_factor = alloca i16, align 2
+  %__s1 = alloca <8 x i16>, align 16
+  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp.coerce = alloca i64, align 8
+  %__s135 = alloca <8 x i16>, align 16
+  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp37.coerce = alloca i64, align 8
+  %__s140 = alloca <8 x i16>, align 16
+  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp42.coerce = alloca i64, align 8
+  %__s145 = alloca <8 x i16>, align 16
+  %__s148 = alloca <8 x i16>, align 16
+  %__s151 = alloca <8 x i16>, align 16
+  store i32 %n_prb, ptr %n_prb.addr, align 4
+  store ptr %src, ptr %src.addr, align 8
+  store ptr %dst, ptr %dst.addr, align 8
+  store ptr %scale, ptr %scale.addr, align 8
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %n_prb.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %src.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
+  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
+  %4 = load <8 x i8>, ptr %arrayidx1, align 1
+  store <8 x i8> %4, ptr %__ret, align 8
+  %5 = load <8 x i8>, ptr %__ret, align 8
+  store <8 x i8> %5, ptr %tmp, align 8
+  %6 = load <8 x i8>, ptr %tmp, align 8
+  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
+  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
+  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
+  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
+  %8 = load <8 x i16>, ptr %__ret.i58, align 16
+  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  store <8 x i16> %8, ptr %arrayidx2, align 16
+  %9 = load ptr, ptr %src.addr, align 8
+  %10 = load i32, ptr %i, align 4
+  %idxprom4 = zext i32 %10 to i64
+  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
+  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
+  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
+  %11 = load <8 x i8>, ptr %arrayidx7, align 1
+  store <8 x i8> %11, ptr %__ret3, align 8
+  %12 = load <8 x i8>, ptr %__ret3, align 8
+  store <8 x i8> %12, ptr %tmp8, align 8
+  %13 = load <8 x i8>, ptr %tmp8, align 8
+  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
+  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
+  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
+  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
+  %15 = load <8 x i16>, ptr %__ret.i55, align 16
+  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  store <8 x i16> %15, ptr %arrayidx10, align 16
+  %16 = load ptr, ptr %src.addr, align 8
+  %17 = load i32, ptr %i, align 4
+  %idxprom12 = zext i32 %17 to i64
+  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
+  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
+  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
+  %18 = load <8 x i8>, ptr %arrayidx15, align 1
+  store <8 x i8> %18, ptr %__ret11, align 8
+  %19 = load <8 x i8>, ptr %__ret11, align 8
+  store <8 x i8> %19, ptr %tmp16, align 8
+  %20 = load <8 x i8>, ptr %tmp16, align 8
+  store <8 x i8> %20, ptr %__p0.addr.i, align 8
+  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
+  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
+  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
+  %22 = load <8 x i16>, ptr %__ret.i, align 16
+  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  store <8 x i16> %22, ptr %arrayidx18, align 16
+  %23 = load ptr, ptr %src.addr, align 8
+  %24 = load i32, ptr %i, align 4
+  %idxprom19 = zext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
+  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
+  %25 = load i8, ptr %exp, align 1
+  %conv = sext i8 %25 to i32
+  %mul = mul nsw i32 %conv, 2
+  %conv21 = trunc i32 %mul to i16
+  store i16 %conv21, ptr %scaling_factor, align 2
+  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  %26 = load <8 x i16>, ptr %arrayidx22, align 16
+  %27 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
+  store i16 %27, ptr %__p1.addr.i76, align 2
+  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
+  %29 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
+  %30 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
+  %31 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
+  %32 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
+  %33 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
+  %34 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
+  %35 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
+  %36 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
+  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
+  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
+  %mul.i87 = mul <8 x i16> %28, %37
+  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
+  %38 = load <8 x i16>, ptr %__ret.i77, align 16
+  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  store <8 x i16> %38, ptr %arrayidx24, align 16
+  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  %39 = load <8 x i16>, ptr %arrayidx25, align 16
+  %40 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
+  store i16 %40, ptr %__p1.addr.i63, align 2
+  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
+  %42 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
+  %43 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
+  %44 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
+  %45 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
+  %46 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
+  %47 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
+  %48 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
+  %49 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
+  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
+  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
+  %mul.i74 = mul <8 x i16> %41, %50
+  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
+  %51 = load <8 x i16>, ptr %__ret.i64, align 16
+  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  store <8 x i16> %51, ptr %arrayidx27, align 16
+  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  %52 = load <8 x i16>, ptr %arrayidx28, align 16
+  %53 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
+  store i16 %53, ptr %__p1.addr.i, align 2
+  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
+  %55 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
+  %56 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
+  %57 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
+  %58 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
+  %59 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
+  %60 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
+  %61 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
+  %62 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  %mul.i = mul <8 x i16> %54, %63
+  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
+  %64 = load <8 x i16>, ptr %__ret.i61, align 16
+  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  store <8 x i16> %64, ptr %arrayidx30, align 16
+  %65 = load ptr, ptr %scale.addr, align 8
+  %cmp31 = icmp ne ptr %65, null
+  br i1 %cmp31, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %66 = load <8 x i16>, ptr %arrayidx32, align 16
+  %67 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
+  %68 = load i64, ptr %agg.tmp.coerce, align 8
+  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
+  store <8 x i16> %call33, ptr %__s1, align 16
+  %69 = load ptr, ptr %dst.addr, align 8
+  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
+  %70 = load <8 x i16>, ptr %__s1, align 16
+  %71 = bitcast <8 x i16> %70 to <16 x i8>
+  %72 = bitcast <16 x i8> %71 to <8 x i16>
+  store <8 x i16> %72, ptr %arrayidx34, align 2
+  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %73 = load <8 x i16>, ptr %arrayidx36, align 16
+  %74 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
+  %75 = load i64, ptr %agg.tmp37.coerce, align 8
+  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
+  store <8 x i16> %call38, ptr %__s135, align 16
+  %76 = load ptr, ptr %dst.addr, align 8
+  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
+  %77 = load <8 x i16>, ptr %__s135, align 16
+  %78 = bitcast <8 x i16> %77 to <16 x i8>
+  %79 = bitcast <16 x i8> %78 to <8 x i16>
+  store <8 x i16> %79, ptr %arrayidx39, align 2
+  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %80 = load <8 x i16>, ptr %arrayidx41, align 16
+  %81 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
+  %82 = load i64, ptr %agg.tmp42.coerce, align 8
+  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
+  store <8 x i16> %call43, ptr %__s140, align 16
+  %83 = load ptr, ptr %dst.addr, align 8
+  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
+  %84 = load <8 x i16>, ptr %__s140, align 16
+  %85 = bitcast <8 x i16> %84 to <16 x i8>
+  %86 = bitcast <16 x i8> %85 to <8 x i16>
+  store <8 x i16> %86, ptr %arrayidx44, align 2
+  %87 = load ptr, ptr %dst.addr, align 8
+  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
+  store ptr %add.ptr, ptr %dst.addr, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %88 = load <8 x i16>, ptr %arrayidx46, align 16
+  store <8 x i16> %88, ptr %__s145, align 16
+  %89 = load ptr, ptr %dst.addr, align 8
+  %90 = load <8 x i16>, ptr %__s145, align 16
+  %91 = bitcast <8 x i16> %90 to <16 x i8>
+  %92 = bitcast <16 x i8> %91 to <8 x i16>
+  store <8 x i16> %92, ptr %89, align 2
+  %93 = load ptr, ptr %dst.addr, align 8
+  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
+  store ptr %add.ptr47, ptr %dst.addr, align 8
+  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %94 = load <8 x i16>, ptr %arrayidx49, align 16
+  store <8 x i16> %94, ptr %__s148, align 16
+  %95 = load ptr, ptr %dst.addr, align 8
+  %96 = load <8 x i16>, ptr %__s148, align 16
+  %97 = bitcast <8 x i16> %96 to <16 x i8>
+  %98 = bitcast <16 x i8> %97 to <8 x i16>
+  store <8 x i16> %98, ptr %95, align 2
+  %99 = load ptr, ptr %dst.addr, align 8
+  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
+  store ptr %add.ptr50, ptr %dst.addr, align 8
+  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %100 = load <8 x i16>, ptr %arrayidx52, align 16
+  store <8 x i16> %100, ptr %__s151, align 16
+  %101 = load ptr, ptr %dst.addr, align 8
+  %102 = load <8 x i16>, ptr %__s151, align 16
+  %103 = bitcast <8 x i16> %102 to <16 x i8>
+  %104 = bitcast <16 x i8> %103 to <8 x i16>
+  store <8 x i16> %104, ptr %101, align 2
+  %105 = load ptr, ptr %dst.addr, align 8
+  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
+  store ptr %add.ptr53, ptr %dst.addr, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %106 = load i32, ptr %i, align 4
+  %inc = add i32 %106, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
+; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
+; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
+; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
+;
+entry:
+  %__p0.addr.i102 = alloca <4 x i32>, align 16
+  %__p1.addr.i103 = alloca <4 x i16>, align 8
+  %__p2.addr.i104 = alloca <4 x i16>, align 8
+  %__ret.i105 = alloca <4 x i32>, align 16
+  %__p0.addr.i98 = alloca <4 x i32>, align 16
+  %__p1.addr.i99 = alloca <4 x i16>, align 8
+  %__p2.addr.i100 = alloca <4 x i16>, align 8
+  %__ret.i101 = alloca <4 x i32>, align 16
+  %__p0.addr.i95 = alloca <8 x i16>, align 16
+  %__ret.i96 = alloca <4 x i16>, align 8
+  %__p0.addr.i92 = alloca <8 x i16>, align 16
+  %__ret.i93 = alloca <4 x i16>, align 8
+  %__p0.addr.i89 = alloca <8 x i16>, align 16
+  %__ret.i90 = alloca <4 x i16>, align 8
+  %__p0.addr.i86 = alloca <8 x i16>, align 16
+  %__ret.i87 = alloca <4 x i16>, align 8
+  %__p0.addr.i83 = alloca <8 x i16>, align 16
+  %__ret.i84 = alloca <4 x i16>, align 8
+  %__p0.addr.i80 = alloca <8 x i16>, align 16
+  %__ret.i81 = alloca <4 x i16>, align 8
+  %__p0.addr.i77 = alloca <8 x i16>, align 16
+  %__ret.i78 = alloca <4 x i16>, align 8
+  %__p0.addr.i74 = alloca <8 x i16>, align 16
+  %__ret.i75 = alloca <4 x i16>, align 8
+  %__p0.addr.i69 = alloca <4 x i16>, align 8
+  %__p1.addr.i70 = alloca <4 x i16>, align 8
+  %__ret.i71 = alloca <4 x i32>, align 16
+  %__p0.addr.i66 = alloca <4 x i16>, align 8
+  %__p1.addr.i67 = alloca <4 x i16>, align 8
+  %__ret.i68 = alloca <4 x i32>, align 16
+  %__p0.addr.i64 = alloca <4 x i32>, align 16
+  %__ret.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <4 x i32>, align 16
+  %__ret.i63 = alloca <8 x i16>, align 16
+  %__p0.addr.i58 = alloca <8 x i16>, align 16
+  %__p1.addr.i59 = alloca <8 x i16>, align 16
+  %__ret.i60 = alloca <8 x i16>, align 16
+  %__p0.addr.i51 = alloca <4 x i32>, align 16
+  %__p1.addr.i52 = alloca <8 x i16>, align 16
+  %__p2.addr.i53 = alloca <8 x i16>, align 16
+  %__ret.i54 = alloca <4 x i32>, align 16
+  %a.addr.i46 = alloca <4 x i32>, align 16
+  %b.addr.i47 = alloca <8 x i16>, align 16
+  %c.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i40 = alloca <8 x i16>, align 16
+  %__p1.addr.i41 = alloca <8 x i16>, align 16
+  %__ret.i42 = alloca <4 x i32>, align 16
+  %a.addr.i = alloca <8 x i16>, align 16
+  %b.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i38 = alloca <8 x i16>, align 16
+  %__ret.i39 = alloca <8 x i16>, align 16
+  %__p0.addr.i36 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca <8 x i16>, align 16
+  %__p2.addr.i = alloca <8 x i16>, align 16
+  %__ret.i37 = alloca <8 x i16>, align 16
+  %__p0.addr.i29 = alloca i32, align 4
+  %__ret.i30 = alloca <4 x i32>, align 16
+  %.compoundliteral.i31 = alloca <4 x i32>, align 16
+  %__p0.addr.i27 = alloca <4 x i32>, align 16
+  %__ret.i28 = alloca <8 x i16>, align 16
+  %__p0.addr.i16 = alloca i16, align 2
+  %__ret.i17 = alloca <8 x i16>, align 16
+  %.compoundliteral.i18 = alloca <8 x i16>, align 16
+  %__p0.addr.i14 = alloca i16, align 2
+  %__ret.i15 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i16>, align 16
+  %__ret.i = alloca <8 x i16>, align 16
+  %scale = alloca %struct.cmplx_int16_t, align 2
+  %a.addr = alloca <8 x i16>, align 16
+  %a_rev = alloca <8 x i16>, align 16
+  %cc = alloca <8 x i16>, align 16
+  %dd = alloca <8 x i16>, align 16
+  %mult_mask = alloca <8 x i16>, align 16
+  %lo32 = alloca <4 x i32>, align 16
+  %hi32 = alloca <4 x i32>, align 16
+  %coerce.val.ii = trunc i64 %scale.coerce to i32
+  store i32 %coerce.val.ii, ptr %scale, align 2
+  store <8 x i16> %a, ptr %a.addr, align 16
+  %0 = load <8 x i16>, ptr %a.addr, align 16
+  store <8 x i16> %0, ptr %__p0.addr.i, align 16
+  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
+  %3 = load <8 x i16>, ptr %__ret.i, align 16
+  store <8 x i16> %3, ptr %a_rev, align 16
+  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
+  %4 = load i16, ptr %re, align 2
+  store i16 %4, ptr %__p0.addr.i16, align 2
+  %5 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
+  %6 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
+  %7 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
+  %8 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
+  %9 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
+  %10 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
+  %11 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
+  %12 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
+  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
+  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
+  store <8 x i16> %13, ptr %__ret.i17, align 16
+  %14 = load <8 x i16>, ptr %__ret.i17, align 16
+  store <8 x i16> %14, ptr %cc, align 16
+  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
+  %15 = load i16, ptr %im, align 2
+  store i16 %15, ptr %__p0.addr.i14, align 2
+  %16 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
+  %17 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
+  %18 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
+  %19 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
+  %20 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
+  %21 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
+  %22 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
+  %23 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  store <8 x i16> %24, ptr %__ret.i15, align 16
+  %25 = load <8 x i16>, ptr %__ret.i15, align 16
+  store <8 x i16> %25, ptr %dd, align 16
+  store i32 65535, ptr %__p0.addr.i29, align 4
+  %26 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
+  %27 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
+  %28 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
+  %29 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
+  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
+  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
+  store <4 x i32> %30, ptr %__ret.i30, align 16
+  %31 = load <4 x i32>, ptr %__ret.i30, align 16
+  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
+  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
+  %33 = bitcast <4 x i32> %32 to <8 x i16>
+  store <8 x i16> %33, ptr %__ret.i28, align 16
+  %34 = load <8 x i16>, ptr %__ret.i28, align 16
+  store <8 x i16> %34, ptr %mult_mask, align 16
+  %35 = load <8 x i16>, ptr %mult_mask, align 16
+  %36 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
+  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
+  %38 = bitcast <8 x i16> %37 to <16 x i8>
+  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
+  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
+  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
+  %39 = load <8 x i16>, ptr %__ret.i39, align 16
+  %40 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
+  store <8 x i16> %39, ptr %__p1.addr.i, align 16
+  store <8 x i16> %40, ptr %__p2.addr.i, align 16
+  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
+  %42 = bitcast <8 x i16> %41 to <16 x i8>
+  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
+  %44 = bitcast <8 x i16> %43 to <16 x i8>
+  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
+  %46 = bitcast <8 x i16> %45 to <16 x i8>
+  %vbsl3.i = and <8 x i16> %41, %43
+  %47 = xor <8 x i16> %41, splat (i16 -1)
+  %vbsl4.i = and <8 x i16> %47, %45
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
+  %48 = load <8 x i16>, ptr %__ret.i37, align 16
+  store <8 x i16> %48, ptr %dd, align 16
+  %49 = load <8 x i16>, ptr %a.addr, align 16
+  %50 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %49, ptr %a.addr.i, align 16
+  store <8 x i16> %50, ptr %b.addr.i, align 16
+  %51 = load <8 x i16>, ptr %a.addr.i, align 16
+  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
+  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
+  %54 = load <4 x i16>, ptr %__ret.i84, align 8
+  %55 = load <8 x i16>, ptr %b.addr.i, align 16
+  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
+  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
+  %58 = load <4 x i16>, ptr %__ret.i81, align 8
+  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
+  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
+  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
+  %60 = bitcast <4 x i16> %59 to <8 x i8>
+  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
+  %62 = bitcast <4 x i16> %61 to <8 x i8>
+  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
+  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
+  %63 = load <4 x i32>, ptr %__ret.i71, align 16
+  store <4 x i32> %63, ptr %lo32, align 16
+  %64 = load <8 x i16>, ptr %a.addr, align 16
+  %65 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
+  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
+  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
+  %69 = load <4 x i16>, ptr %__ret.i96, align 8
+  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
+  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
+  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
+  %73 = load <4 x i16>, ptr %__ret.i93, align 8
+  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
+  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
+  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
+  %75 = bitcast <4 x i16> %74 to <8 x i8>
+  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
+  %77 = bitcast <4 x i16> %76 to <8 x i8>
+  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
+  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
+  %78 = load <4 x i32>, ptr %__ret.i68, align 16
+  store <4 x i32> %78, ptr %__ret.i42, align 16
+  %79 = load <4 x i32>, ptr %__ret.i42, align 16
+  store <4 x i32> %79, ptr %hi32, align 16
+  %80 = load <4 x i32>, ptr %lo32, align 16
+  %81 = load <8 x i16>, ptr %a_rev, align 16
+  %82 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %80, ptr %a.addr.i46, align 16
+  store <8 x i16> %81, ptr %b.addr.i47, align 16
+  store <8 x i16> %82, ptr %c.addr.i, align 16
+  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
+  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
+  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
+  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
+  %87 = load <4 x i16>, ptr %__ret.i78, align 8
+  %88 = load <8 x i16>, ptr %c.addr.i, align 16
+  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
+  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
+  %91 = load <4 x i16>, ptr %__ret.i75, align 8
+  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
+  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
+  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
+  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
+  %93 = bitcast <4 x i32> %92 to <16 x i8>
+  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
+  %95 = bitcast <4 x i16> %94 to <8 x i8>
+  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
+  %97 = bitcast <4 x i16> %96 to <8 x i8>
+  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
+  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
+  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
+  %98 = load <4 x i32>, ptr %__ret.i105, align 16
+  store <4 x i32> %98, ptr %lo32, align 16
+  %99 = load <4 x i32>, ptr %hi32, align 16
+  %100 = load <8 x i16>, ptr %a_rev, align 16
+  %101 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
+  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
+  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
+  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
+  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
+  %106 = load <4 x i16>, ptr %__ret.i90, align 8
+  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
+  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
+  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
+  %110 = load <4 x i16>, ptr %__ret.i87, align 8
+  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
+  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
+  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
+  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
+  %112 = bitcast <4 x i32> %111 to <16 x i8>
+  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
+  %114 = bitcast <4 x i16> %113 to <8 x i8>
+  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
+  %116 = bitcast <4 x i16> %115 to <8 x i8>
+  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
+  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
+  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
+  %117 = load <4 x i32>, ptr %__ret.i101, align 16
+  store <4 x i32> %117, ptr %__ret.i54, align 16
+  %118 = load <4 x i32>, ptr %__ret.i54, align 16
+  store <4 x i32> %118, ptr %hi32, align 16
+  %119 = load <4 x i32>, ptr %lo32, align 16
+  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
+  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
+  %121 = bitcast <4 x i32> %120 to <8 x i16>
+  store <8 x i16> %121, ptr %__ret.i65, align 16
+  %122 = load <8 x i16>, ptr %__ret.i65, align 16
+  %123 = load <4 x i32>, ptr %hi32, align 16
+  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
+  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
+  %125 = bitcast <4 x i32> %124 to <8 x i16>
+  store <8 x i16> %125, ptr %__ret.i63, align 16
+  %126 = load <8 x i16>, ptr %__ret.i63, align 16
+  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
+  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
+  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
+  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
+  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
+  %129 = load <8 x i16>, ptr %__ret.i60, align 16
+  ret <8 x i16> %129
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 20.0.0git"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+;.
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6c45442bdcd3..13ea35a87c31 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index b2d6455a8294..a38413f26ec9 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 82b1cf90b472..f583a616dd37 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target triple = "arm64-apple-darwin"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 2f61c89241fd..801a8a05d82e 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S -o - %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx14.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
index b14a36c8b3bc..2703d2390ce5 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 7fccfeea39bb..886e7a758d05 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-matrix -S %s | FileCheck %s
 
 target triple = "arm64-apple-ios"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
index 10b07ad6e749..d34063824fe7 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default<O3>' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes="default<O3>" -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
deleted file mode 100644
index b400b27df083..000000000000
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-; for (int i = 0; i < 1024; ++i) {
-;   dst[i].x = a[i].x + b[i].x;
-;   dst[i].y = a[i].y - b[i].y;
-;   dst[i].z = a[i].z << b[i].z;
-;   dst[i].t = a[i].t >> b[i].t;
-; }
-
-define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) {
-; CHECK-LABEL: @interleave_deinterleave(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
-; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]]
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
-; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[Y]], align 4
-; CHECK-NEXT:    [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]]
-; CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[Y14]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[Z]], align 4
-; CHECK-NEXT:    [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
-; CHECK-NEXT:    store i32 [[SHL]], ptr [[Z22]], align 4
-; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[T]], align 4
-; CHECK-NEXT:    [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[T27]], align 4
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]]
-; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
-; CHECK-NEXT:    store i32 [[SHR]], ptr [[T30]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
-  store i32 %add, ptr %arrayidx5, align 4
-  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
-  %2 = load i32, ptr %y, align 4
-  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
-  %3 = load i32, ptr %y11, align 4
-  %sub = sub nsw i32 %2, %3
-  %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
-  store i32 %sub, ptr %y14, align 4
-  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
-  %4 = load i32, ptr %z, align 4
-  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
-  %5 = load i32, ptr %z19, align 4
-  %shl = shl i32 %4, %5
-  %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
-  store i32 %shl, ptr %z22, align 4
-  %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
-  %6 = load i32, ptr %t, align 4
-  %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
-  %7 = load i32, ptr %t27, align 4
-  %shr = ashr i32 %6, %7
-  %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
-  store i32 %shr, ptr %t30, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond.not, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index b1d0c70abc55..76d9d14b7d14 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 5b7622bca6b5..2ab6f2bdca8d 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck --check-prefix=OLDPM %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck --check-prefix=NEWPM %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck --check-prefix=NEWPM %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index aab787b2b2d2..778f25f5620f 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index ef651010d6a9..9032c363eb93 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This test after a lot of cleanup should produce pick a tail-predicated 8x
 ; vector loop. The 8x will be more profitable, to pick a VQDMULH.s16 instruction.
@@ -15,7 +15,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK:       while.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER18:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
@@ -48,18 +48,18 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER18]]
-; CHECK:       while.body.preheader18:
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]]
+; CHECK:       while.body.preheader15:
 ; CHECK-NEXT:    [[BLKCNT_06_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCA_ADDR_05_PH:%.*]] = phi ptr [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_04_PH:%.*]] = phi ptr [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03_PH:%.*]] = phi ptr [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER18]] ]
+; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER15]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[PSRCA_ADDR_05]], i32 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP10]] to i32
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
index 6cbba5ca2a6a..664953a71c9a 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                   | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 57a3d8175ba5..5a3742c7d85b 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -unroll-runtime -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -unroll-runtime -S %s | FileCheck %s
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 798824bce4da..a4aea02a3351 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
@@ -78,30 +78,16 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8i16_u1234567(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; SSE4-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @add_v8i16_u1234567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT:    ret <8 x i16> [[RESULT]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -172,13 +158,10 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -202,13 +185,10 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -232,40 +212,28 @@ define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: @add_v4i32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v4i32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; SSE4-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4i32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4i32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -289,13 +257,10 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_012u(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -420,46 +385,30 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
 
 define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
 ; SSE2-LABEL: @add_v8i32_01234u67(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE2-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v8i32_01234u67(
 ; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
 ; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
 ; SSE4-NEXT:    [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE4-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
 ; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
 ; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8i32_01234u67(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x i32> [[RESULT]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -530,13 +479,10 @@ define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -560,13 +506,10 @@ define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -589,41 +532,11 @@ define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: @add_v4f32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
-;
-; SSE4-LABEL: @add_v4f32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX2-LABEL: @add_v4f32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX512-LABEL: @add_v4f32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-LABEL: @add_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -647,39 +560,27 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: @add_v4f32_012u(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4f32_012u(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4f32_012u(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4f32_012u(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
@@ -804,33 +705,40 @@ define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @add_v8f32_012u4567(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:    ret <8 x float> [[RESULT]]
+; SSE2-LABEL: @add_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8f32_012u4567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 14, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x float> [[RESULT]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -983,13 +891,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_u123(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 1, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1034,13 +939,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_0u23(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1085,13 +987,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_01u3(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1136,13 +1035,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_012u(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
new file mode 100644
index 000000000000..bcb316a4a73e
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -0,0 +1,1141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR34072 - failure to canonicalize to (sub (shuffle a, b),(shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)
+
+;
+; v8i16
+;
+
+define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sub_v8i16_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: @sub_v8i16_u1234567(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
+; SSE2-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
+; SSE2-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
+; SSE2-NEXT:    [[A23:%.*]] = sub i16 [[A2]], [[A3]]
+; SSE2-NEXT:    [[A45:%.*]] = sub i16 [[A4]], [[A5]]
+; SSE2-NEXT:    [[A67:%.*]] = sub i16 [[A6]], [[A7]]
+; SSE2-NEXT:    [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
+; SSE2-NEXT:    [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
+; SSE2-NEXT:    [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8i16_u1234567(
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
+;
+; AVX-LABEL: @sub_v8i16_u1234567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+;
+; v4i32
+;
+
+define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @sub_v4i32_01u3(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4i32_01u3(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4i32_01u3(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4i32_01u3(
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX512-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_012u(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x i32> %result
+}
+
+;
+; v8i32
+;
+
+define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @sub_v8i32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: @sub_v8i32_01234u67(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v8i32_01234u67(
+; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8i32_01234u67(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+;
+; v4f32
+;
+
+define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @sub_v4f32_012u(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f32_012u(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4f32_012u(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4f32_012u(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x float> %result
+}
+
+;
+; v8f32
+;
+
+define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @sub_v8f32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: @sub_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8f32_012u4567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+;
+; v2f64
+;
+
+define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_01(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_u1(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_0u(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
+  ret <2 x double> %result
+}
+
+;
+; v4f64
+;
+
+define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @sub_v4f64_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_u123(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_u123(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_u123(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_0u23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_0u23(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 2, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_0u23(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01u3(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_01u3(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_01u3(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_012u(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE2-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_012u(
+; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_012u(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_uu23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f64_uu23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; AVX-LABEL: @sub_v4f64_uu23(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[RESULT1]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01uu(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    ret <4 x double> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f64_01uu(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[TMP3]]
+;
+; AVX-LABEL: @sub_v4f64_01uu(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x double> %result
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 4e1051d1991a..d92df9741644 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -31,13 +31,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
 ; AVX-LABEL: @PR50392(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[SHUFFLE]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
index fe5bba1ae977..7bb22e2e9f5b 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
index 2843a7e76123..126be0252062 100644
--- a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
+++ b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; These are tests that check for set/clear bits in a bitfield based on PR37098:
 ; https://bugs.llvm.org/show_bug.cgi?id=37098
diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
index 7ff3c5dc5536..7cdddd1e4232 100644
--- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt -S -passes='lto<O3>' < %s | FileCheck %s --check-prefixes=CHECK,LTO
 
 declare void @llvm.trap()
diff --git a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
index 689f4a9798a7..641f21630214 100644
--- a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
+++ b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s --check-prefixes=ALL,O3
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=ALL,O3
 ; RUN: opt -passes='default<O2>' -S < %s  | FileCheck %s --check-prefixes=ALL,O2
 ; RUN: opt -passes='default<O1>' -S < %s  | FileCheck %s --check-prefixes=ALL,O1
 
diff --git a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
index 2139542c3d3b..82a453d72079 100644
--- a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
+++ b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
index ba6c36abd24a..cc20233a5fef 100644
--- a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
+++ b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This is based on the following most basic C++ code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
index 21fa234535ca..1239b18c0701 100644
--- a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -passes='default<O0>' -S | FileCheck %s --check-prefixes=CHECK,NOOPT
 ; RUN: opt < %s -passes='default<O1>' -S | FileCheck %s --check-prefixes=CHECK,OPT
 ; RUN: opt < %s -passes='default<O2>' -S | FileCheck %s --check-prefixes=CHECK,OPT
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s --check-prefixes=CHECK,OPT
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
index bae3e269b6f0..c6b5e5f3ccae 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
 
 ; This example is produced from a very basic C code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/pr32544.ll b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
index 421260b10231..135084dbbc7d 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr32544.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @foo(i1 %which, i32 %a, i32 %b, ptr %result) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr45682.ll b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
index 22305806a19c..46ee19178e35 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr45682.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @PR45682(i32 %x, i32 %y) {
 ; CHECK-LABEL: @PR45682(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr62311.ll b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
index 03276d83d8a0..027df7da6a74 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr62311.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; C++ version of test case
 ; #include <x86intrin.h>
diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
index 016460fed7c3..6941ea2aece9 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 ; Make sure that interaction of "writable" with various passes does not
 ; result in the elimination of the store prior to @j().
diff --git a/llvm/test/Transforms/PhaseOrdering/rotate.ll b/llvm/test/Transforms/PhaseOrdering/rotate.ll
index 9ce196941df9..9179edc4281e 100644
--- a/llvm/test/Transforms/PhaseOrdering/rotate.ll
+++ b/llvm/test/Transforms/PhaseOrdering/rotate.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; This should become a single funnel shift through a combination
 ; of aggressive-instcombine, simplifycfg, and instcombine.
diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
index 03df1384ca40..9da46bde96ef 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='default<O1>' -S < %s | FileCheck %s
 ; RUN: opt -passes='default<O2>' -S < %s | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 
 ; We are worse at propagating correlation facts when in select form
 ; as compared to the PHI form, so if we lower switches to early,
diff --git a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
index 0e352ba52f6a..3fbb02d04629 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 define i8 @test_switch_with_sext_phi(i8 %code) {
 ; CHECK-LABEL: define noundef i8 @test_switch_with_sext_phi(
diff --git a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
index d2f33f9c3b75..33266caf7092 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes='default<O1>' < %s | FileCheck %s
 ; RUN: opt -S -passes='default<O2>' < %s | FileCheck %s
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 target datalayout = "n64"
 
diff --git a/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
new file mode 100644
index 000000000000..e81c5d739c6d
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=ipsccp %s | FileCheck %s
+
+declare void @use(i1)
+
+define i1 @test1(i64 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i64 [[X:%.*]], 65535
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[X]], -65521
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp ugt i64 %x, 65535
+  call void @llvm.assume(i1 %cond)
+  %mask = and i64 %x, -65521
+  %cmp = icmp eq i64 %mask, 0
+  ret i1 %cmp
+}
+
+define void @test.and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp uge i64 [[X:%.*]], 138
+; CHECK-NEXT:    [[C1:%.*]] = icmp ule i64 [[X]], 161
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i64 [[Y:%.*]], 186
+; CHECK-NEXT:    [[C3:%.*]] = icmp ule i64 [[Y]], 188
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ult i64 [[AND]], 137
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp uge i64 %x, 138 ; 0b10001010
+  %c1 = icmp ule i64 %x, 161 ; 0b10100000
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp uge i64 %y, 186 ; 0b10111010
+  %c3 = icmp ule i64 %y, 188 ; 0b10111110
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %and = and i64 %x, %y
+  %r0 = icmp ult i64 %and, 136 ; 0b10001000
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ult i64 %and, 137
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
+
+define void @test.or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp ule i64 [[X:%.*]], 117
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i64 [[X]], 95
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp ule i64 [[Y:%.*]], 69
+; CHECK-NEXT:    [[C3:%.*]] = icmp uge i64 [[Y]], 67
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i64 [[OR]], 118
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp ule i64 %x, 117 ; 0b01110101
+  %c1 = icmp uge i64 %x, 95  ; 0b01011111
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp ule i64 %y, 69  ; 0b01000101
+  %c3 = icmp uge i64 %y, 67  ; 0b01000011
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %or = or i64 %x, %y
+  %r0 = icmp ugt i64 %or, 119 ; 0b01110111
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ugt i64 %or, 118
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 6dceabe1d324..00a4417ba7af 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -80,16 +80,11 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
 ; NOFP16-LABEL: define half @reduce_fast_half8(
 ; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
 ; NOFP16-NEXT:  [[ENTRY:.*:]]
-; NOFP16-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; NOFP16-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; NOFP16-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; NOFP16-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
 ; NOFP16-NEXT:    [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; NOFP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
-; NOFP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
-; NOFP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
-; NOFP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; NOFP16-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NOFP16-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
+; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
 ; NOFP16-NEXT:    ret half [[OP_RDX3]]
 ;
 ; FULLFP16-LABEL: define half @reduce_fast_half8(
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
new file mode 100644
index 000000000000..cf1ed54149b8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s |  FileCheck %s
+
+define i64 @test(ptr %arg, i32 %arg1, i64 %i) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[ARG:%.*]], i32 [[ARG1:%.*]], i64 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[I]]
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr i8, ptr [[I2]], i64 [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr i8, ptr [[I3]], i64 [[I]]
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr i8, ptr [[I4]], i64 [[I]]
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr i8, ptr [[I5]], i64 [[I]]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[I]]
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr i8, ptr [[I7]], i64 [[I]]
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr i8, ptr [[I8]], i64 [[I]]
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr i8, ptr [[I9]], i64 [[I]]
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr i8, ptr [[I10]], i64 [[I]]
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr i8, ptr [[I11]], i64 [[I]]
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr i8, ptr [[I12]], i64 [[I]]
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr i8, ptr [[I13]], i64 [[I]]
+; CHECK-NEXT:    [[I140:%.*]] = load i8, ptr [[I14]], align 1
+; CHECK-NEXT:    [[I1412:%.*]] = zext i8 [[I140]] to i32
+; CHECK-NEXT:    [[I142:%.*]] = mul i32 [[ARG1]], [[I1412]]
+; CHECK-NEXT:    [[I143:%.*]] = getelementptr i8, ptr [[I13]], i64 15
+; CHECK-NEXT:    [[I144:%.*]] = load i8, ptr [[I143]], align 1
+; CHECK-NEXT:    [[I1453:%.*]] = zext i8 [[I144]] to i32
+; CHECK-NEXT:    [[I146:%.*]] = mul i32 [[ARG1]], [[I1453]]
+; CHECK-NEXT:    [[I147:%.*]] = getelementptr i8, ptr [[I13]], i64 14
+; CHECK-NEXT:    [[I148:%.*]] = load i8, ptr [[I147]], align 1
+; CHECK-NEXT:    [[I1494:%.*]] = zext i8 [[I148]] to i32
+; CHECK-NEXT:    [[I150:%.*]] = mul i32 [[ARG1]], [[I1494]]
+; CHECK-NEXT:    [[I151:%.*]] = getelementptr i8, ptr [[I13]], i64 13
+; CHECK-NEXT:    [[I152:%.*]] = load i8, ptr [[I151]], align 1
+; CHECK-NEXT:    [[I1535:%.*]] = zext i8 [[I152]] to i32
+; CHECK-NEXT:    [[I154:%.*]] = mul i32 [[ARG1]], [[I1535]]
+; CHECK-NEXT:    [[I1311:%.*]] = or i32 [[I142]], [[I146]]
+; CHECK-NEXT:    [[I1312:%.*]] = or i32 [[I1311]], [[I150]]
+; CHECK-NEXT:    [[I1313:%.*]] = or i32 [[I1312]], [[I154]]
+; CHECK-NEXT:    [[I1536:%.*]] = zext i32 [[I1313]] to i64
+; CHECK-NEXT:    ret i64 [[I1536]]
+;
+bb:
+  %i2 = getelementptr i8, ptr %arg, i64 %i
+  %i3 = getelementptr i8, ptr %i2, i64 %i
+  %i4 = getelementptr i8, ptr %i3, i64 %i
+  %i5 = getelementptr i8, ptr %i4, i64 %i
+  %i6 = getelementptr i8, ptr %i5, i64 %i
+  %i7 = getelementptr i8, ptr %i6, i64 %i
+  %i8 = getelementptr i8, ptr %i7, i64 %i
+  %i9 = getelementptr i8, ptr %i8, i64 %i
+  %i10 = getelementptr i8, ptr %i9, i64 %i
+  %i11 = getelementptr i8, ptr %i10, i64 %i
+  %i12 = getelementptr i8, ptr %i11, i64 %i
+  %i13 = getelementptr i8, ptr %i12, i64 %i
+  %i14 = getelementptr i8, ptr %i13, i64 %i
+  %i140 = load i8, ptr %i14, align 1
+  %i1412 = zext i8 %i140 to i32
+  %i142 = mul i32 %arg1, %i1412
+  %i143 = getelementptr i8, ptr %i13, i64 15
+  %i144 = load i8, ptr %i143, align 1
+  %i1453 = zext i8 %i144 to i32
+  %i146 = mul i32 %arg1, %i1453
+  %i147 = getelementptr i8, ptr %i13, i64 14
+  %i148 = load i8, ptr %i147, align 1
+  %i1494 = zext i8 %i148 to i32
+  %i150 = mul i32 %arg1, %i1494
+  %i151 = getelementptr i8, ptr %i13, i64 13
+  %i152 = load i8, ptr %i151, align 1
+  %i1535 = zext i8 %i152 to i32
+  %i154 = mul i32 %arg1, %i1535
+  %i1311 = or i32 %i142, %i146
+  %i1312 = or i32 %i1311, %i150
+  %i1313 = or i32 %i1312, %i154
+  %i1536 = zext i32 %i1313 to i64
+  ret i64 %i1536
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
new file mode 100644
index 000000000000..10c6aeb26ba7
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; Test to ensure that memprof related metadata is not dropped when
+;; instructions are combined. Currently the metadata from the first instruction
+;; is kept, which prevents full loss of profile context information.
+
+; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local noundef nonnull ptr @_Z4testb(i1 noundef zeroext %b) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local noundef nonnull ptr @_Z4testb(
+; CHECK-SAME: i1 noundef zeroext [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof [[META0:![0-9]+]], !callsite [[META3:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !0, !callsite !3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !4, !callsite !7
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+
+declare ptr @_Znwm(i64) nounwind readonly
+
+!0 = !{!1}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+!3 = !{i64 -852997907418798798}
+!4 = !{!5}
+!5 = !{!6, !"cold"}
+!6 = !{i64 123, i64 -2101080423462424381, i64 5188446645037944434}
+!7 = !{i64 123}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], !"notcold"}
+; CHECK: [[META2]] = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+; CHECK: [[META3]] = !{i64 -852997907418798798}
+;.
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
index 92766d5a11aa..420e844b5103 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -11,7 +11,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;       use((b + i) * s);
 ;   }
 ; }
-define void @foo(i32 %b, i32 %s) {
+define ptx_kernel void @foo(i32 %b, i32 %s) {
 ; CHECK-LABEL: .visible .entry foo(
 entry:
 ; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
@@ -65,7 +65,3 @@ for.inc.3:                                        ; preds = %if.then.3, %for.inc
 declare zeroext i1 @cond(i32)
 
 declare void @use(i32)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
index 057d9af314ba..c3639baf8b65 100644
--- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
@@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 }
 
 define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <16 x i8> %v0, zeroinitializer
   %c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
 }
 
 define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; SSE-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX2-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX512-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <4 x i32> %v0, zeroinitializer
   %c1 = icmp slt <4 x i32> %v1, zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 800f57646a3e..f3b7f7b72ee4 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -465,33 +465,13 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 poison, i32 2, i32 4, i32 poison>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[V3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 307fbf711cdc..c125b73fccdd 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -465,34 +465,19 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5c856ce397c1..cd2bc757eb9d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -91,20 +91,11 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
   ret <4 x double> %r
 }
 
-; The vector fneg would cost twice as much as the scalar op with SSE,
-; so we don't transform there (the shuffle would also be more expensive).
-
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
-; SSE-LABEL: @ext7_v8f32(
-; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
-; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R]]
-;
-; AVX-LABEL: @ext7_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
-; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
-; AVX-NEXT:    ret <8 x float> [[R]]
+; CHECK-LABEL: @ext7_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %e = extractelement <8 x float> %x, i32 7
   %n = fneg float %e
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
new file mode 100644
index 000000000000..0acfeccb92ef
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+
+; infinite loop if we add the erased instructions to the work list in the wrong order.
+define void @multiple_extract(ptr %p) {
+; CHECK-LABEL: @multiple_extract(
+; CHECK-NEXT:    [[VP:%.*]] = load ptr, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 0
+; CHECK-NEXT:    [[E0:%.*]] = load i32, ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 1
+; CHECK-NEXT:    [[E1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[E0]], ptr [[P]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT:    store i32 [[E1]], ptr [[P1]], align 4
+; CHECK-NEXT:    ret void
+;
+  %vp = load ptr, ptr %p, align 8
+  %v = load <2 x i32>, ptr %vp, align 16
+  %e0 = extractelement <2 x i32> %v, i64 0
+  %e1 = extractelement <2 x i32> %v, i64 1
+  store i32 %e0, ptr %p, align 4
+  %p1 = getelementptr inbounds nuw i8, ptr %p, i64 4
+  store i32 %e1, ptr %p1, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d4043adc0..2db1e21b3e95 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -544,10 +544,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc
 ; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
 ; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
 ; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; CHECK-NEXT:    store <2 x float> [[T3]], ptr [[RESULTPTR:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index b3360b61e66e..f9108efa7ee7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
 
 declare void @use(<4 x i1>)
 
@@ -105,8 +105,8 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
 define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
 ; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE2-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE2-NEXT:    ret <4 x i32> [[R]]
@@ -115,21 +115,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
 ; SSE4-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
 ; SSE4-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE4-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE4-NEXT:    ret <4 x i32> [[R]]
 ;
-; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; AVX-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX512-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp oeq <4 x float> %x, %y
   %b1 = fcmp oeq <4 x float> %x, %z
-  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
@@ -137,29 +145,13 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; For commutative instructions, common operand may be swapped
 
 define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; SSE-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; SSE-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX2-NEXT:    [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; AVX2-NEXT:    [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX2-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; AVX512-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX512-NEXT:    ret <4 x i32> [[R]]
+; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp one <4 x float> %x, %y
   %b1 = fcmp one <4 x float> %z, %x
@@ -275,3 +267,33 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
+
+; PR121110 - don't merge equivalent (but not matching) predicates
+
+define <2 x i1> @PR121110() {
+; CHECK-LABEL: define <2 x i1> @PR121110(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %res = shufflevector <2 x i1> %ugt, <2 x i1> %sgt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}
+
+define <2 x i1> @PR121110_commute() {
+; CHECK-LABEL: define <2 x i1> @PR121110_commute(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %res = shufflevector <2 x i1> %sgt, <2 x i1> %ugt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}