Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format

author: Mingming Liu <mingmingl@google.com> 2025-09-10 15:25:31 -0700
committer: GitHub <noreply@github.com> 2025-09-10 15:25:31 -0700
commit: 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree: 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/Analysis
parent: 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent: b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
68 files changed, 3381 insertions, 2138 deletions
diff --git a/llvm/test/Analysis/BasicAA/featuretest.ll b/llvm/test/Analysis/BasicAA/featuretest.ll
index fd5d2eff10f8..e4cb009f0c63 100644
--- a/llvm/test/Analysis/BasicAA/featuretest.ll
+++ b/llvm/test/Analysis/BasicAA/featuretest.ll
@@ -102,12 +102,14 @@ define i32 @gep_distance_test(ptr %A) {
 ; cannot alias, even if there is a variable offset between them...
 define i32 @gep_distance_test2(ptr %A, i64 %distance) {
 ; NO_ASSUME-LABEL: @gep_distance_test2(
-; NO_ASSUME-NEXT:    [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
+; NO_ASSUME-NEXT:    [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
+; NO_ASSUME-NEXT:    [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
 ; NO_ASSUME-NEXT:    store i32 7, ptr [[B]], align 4
 ; NO_ASSUME-NEXT:    ret i32 0
 ;
 ; USE_ASSUME-LABEL: @gep_distance_test2(
-; USE_ASSUME-NEXT:    [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
+; USE_ASSUME-NEXT:    [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
+; USE_ASSUME-NEXT:    [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
 ; USE_ASSUME-NEXT:    store i32 7, ptr [[B]], align 4
 ; USE_ASSUME-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4), "nonnull"(ptr [[A]]), "align"(ptr [[A]], i64 4) ]
 ; USE_ASSUME-NEXT:    ret i32 0
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
new file mode 100644
index 000000000000..7e980c9bfe38
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -0,0 +1,248 @@
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
+  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <4 x float> @dead_scalable_store_fixed(ptr %0) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.32, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+;
+  %arr = alloca [64 x i32], align 4
+  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
+  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
+  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+
+  %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+
+  %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+
+  %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fadd = fadd <4 x float> %faddop0, %faddop1
+
+  ret <4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+;
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+  %arr = alloca [64 x i32], align 4
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+  %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+  %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+  %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+
+  %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.32(i32 0, i32 2)
+  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+  ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
+  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+  store i32 20, ptr %gep.1.12
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-NOT: store i32 20, ptr %1
+; CHECK: store i32 50, ptr %gep.5
+define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+  store i32 20, ptr %1
+
+  %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
+  store i32 50, ptr %gep.5
+
+  %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
+  %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %retval
+}
+
+; We don't know if the scalar store is dead as we can't determine vscale.
+; This get active lane mask may cover 4 or 8 integers
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 10, ptr %gep.1.12
+; CHECK: store i32 20, ptr %gep.1.28
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+  store i32 10, ptr %gep.1.12
+  %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+  store i32 20, ptr %gep.1.28
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the mask's Op1 < Op0
+define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
+; CHECK: store i32 20, ptr %1
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+  store i32 20, ptr %1
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+; Don't do anything if the mask's Op1 == Op0
+define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
+; CHECK: store i32 20, ptr %1
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
+  store i32 20, ptr %1
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
+; CHECK-NOT: store i8 60, ptr %gep.1.6
+; CHECK: store i8 120, ptr %gep.1.8
+;
+  %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7)
+  %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
+  store i8 60, ptr %gep.1.6
+  %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+  store i8 120, ptr %gep.1.8
+
+  %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+  call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
+  %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+  ret <vscale x 16 x i8> %retval
+}
+
+define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
+; CHECK-NOT: store i32 10, ptr %gep.1.0
+; CHECK-NOT: store i32 20, ptr %gep.1.4
+; CHECK-NOT: store i32 30, ptr %gep.1.8
+; CHECK: store i32 40, ptr %gep.1.12
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+  %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
+  store i32 10, ptr %gep.1.0
+  %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
+  store i32 20, ptr %gep.1.4
+  %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+  store i32 30, ptr %gep.1.8
+  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+  store i32 40, ptr %gep.1.12
+
+  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  ret <vscale x 4 x float> %retval
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
index 867c35ab7944..07764fbf4acf 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 
+; This tests the cost of fixed-length subvector extracts for NEON.
+; For the SVE equivalent test, see sve-vls-shuffle-extract.ll
+
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @extract_half() {
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll b/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll
index 0789707a53b9..5029ef974681 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll
@@ -58,10 +58,10 @@ define <vscale x 32 x i1> @cmp_nxv32i1() {
 ; Check fcmp for legal FP vectors
 define void @cmp_legal_fp() #0 {
 ; CHECK-LABEL: 'cmp_legal_fp'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %1 = fcmp oge <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %2 = fcmp oge <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %3 = fcmp oge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %4 = fcmp oge <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %1 = fcmp oge <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %2 = fcmp oge <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %3 = fcmp oge <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %4 = fcmp oge <vscale x 8 x bfloat> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %1 = fcmp oge <vscale x 2 x double> undef, undef
@@ -74,7 +74,7 @@ define void @cmp_legal_fp() #0 {
 ; Check fcmp for an illegal FP vector
 define <vscale x 16 x i1> @cmp_nxv16f16() {
 ; CHECK-LABEL: 'cmp_nxv16f16'
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %res = fcmp oge <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %res = fcmp oge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 16 x i1> %res
 ;
   %res = fcmp oge <vscale x 16 x half> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
index a14176cd6d53..140c53b1eb10 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
@@ -1,18 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 | FileCheck %s --check-prefix=CHECK-COST2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=cortex-a510 | FileCheck %s --check-prefix=CHECK-COST2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=neoverse-v2 | FileCheck %s --check-prefix=CHECK-COST2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=a64fx | FileCheck %s --check-prefix=CHECK-COST2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=ampere1b | FileCheck %s --check-prefix=CHECK-COST1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=apple-m4 | FileCheck %s --check-prefix=CHECK-COST1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve2 -mcpu=cortex-a55 | FileCheck %s --check-prefix=CHECK-COST1
 
 define void @fcmp_oeq(i32 %arg) {
-; CHECK-LABEL: 'fcmp_oeq'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oeq <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oeq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oeq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oeq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_oeq'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp oeq <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oeq <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp oeq <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_oeq'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oeq <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
   %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
@@ -27,11 +45,17 @@ define void @fcmp_oeq(i32 %arg) {
 }
 
 define void @fcmp_oeq_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_oeq_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_oeq_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_oeq_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
@@ -40,17 +64,29 @@ define void @fcmp_oeq_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ogt(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ogt'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ogt <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ogt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ogt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ogt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ogt'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ogt <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ogt <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ogt <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ogt'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ogt <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
@@ -65,11 +101,17 @@ define void @fcmp_ogt(i32 %arg) {
 }
 
 define void @fcmp_ogt_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ogt_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ogt_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ogt_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
@@ -78,17 +120,29 @@ define void @fcmp_ogt_bfloat(i32 %arg) {
 }
 
 define void @fcmp_oge(i32 %arg) {
-; CHECK-LABEL: 'fcmp_oge'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oge <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_oge'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp oge <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oge <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp oge <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_oge'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp oge <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
   %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
@@ -103,11 +157,17 @@ define void @fcmp_oge(i32 %arg) {
 }
 
 define void @fcmp_oge_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_oge_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_oge_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_oge_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
@@ -116,17 +176,29 @@ define void @fcmp_oge_bfloat(i32 %arg) {
 }
 
 define void @fcmp_olt(i32 %arg) {
-; CHECK-LABEL: 'fcmp_olt'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp olt <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp olt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp olt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp olt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_olt'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp olt <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp olt <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp olt <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_olt'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp olt <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
   %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
@@ -141,11 +213,17 @@ define void @fcmp_olt(i32 %arg) {
 }
 
 define void @fcmp_olt_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_olt_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_olt_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_olt_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
@@ -154,17 +232,29 @@ define void @fcmp_olt_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ole(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ole'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ole <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ole <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ole <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ole <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ole'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ole <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ole <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ole <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ole'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ole <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
@@ -179,11 +269,17 @@ define void @fcmp_ole(i32 %arg) {
 }
 
 define void @fcmp_ole_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ole_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ole_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ole_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
@@ -192,17 +288,29 @@ define void @fcmp_ole_bfloat(i32 %arg) {
 }
 
 define void @fcmp_one(i32 %arg) {
-; CHECK-LABEL: 'fcmp_one'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp one <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp one <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v8f32 = fcmp one <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp one <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v4f64 = fcmp one <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp one <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp one <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp one <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v16f16 = fcmp one <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_one'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp one <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp one <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v8f32 = fcmp one <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp one <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v4f64 = fcmp one <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp one <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp one <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp one <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v16f16 = fcmp one <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_one'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp one <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp one <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v8f32 = fcmp one <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp one <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v4f64 = fcmp one <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp one <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp one <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp one <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v16f16 = fcmp one <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp one <vscale x 2 x float> undef, undef
   %v4f32 = fcmp one <vscale x 4 x float> undef, undef
@@ -217,11 +325,17 @@ define void @fcmp_one(i32 %arg) {
 }
 
 define void @fcmp_one_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_one_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp one <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_one_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp one <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_one_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp one <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
@@ -230,17 +344,29 @@ define void @fcmp_one_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ord(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ord'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ord <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ord <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ord <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ord <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ord'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ord <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ord <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ord <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ord <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ord <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ord <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ord <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ord'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ord <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ord <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ord <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ord <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
@@ -255,11 +381,17 @@ define void @fcmp_ord(i32 %arg) {
 }
 
 define void @fcmp_ord_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ord_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ord_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ord_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
@@ -268,17 +400,29 @@ define void @fcmp_ord_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ueq(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ueq'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v8f32 = fcmp ueq <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp ueq <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v4f64 = fcmp ueq <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp ueq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp ueq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp ueq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 6 for: %v16f16 = fcmp ueq <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ueq'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v8f32 = fcmp ueq <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp ueq <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v4f64 = fcmp ueq <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %v16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ueq'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v8f32 = fcmp ueq <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f64 = fcmp ueq <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v4f64 = fcmp ueq <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:6 SizeLat:3 for: %v8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 6 for: %v16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
@@ -293,11 +437,17 @@ define void @fcmp_ueq(i32 %arg) {
 }
 
 define void @fcmp_ueq_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ueq_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp ueq <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ueq_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp ueq <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ueq_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:5 Lat:8 SizeLat:5 for: %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:9 Lat:9 SizeLat:9 for: %v8bf16 = fcmp ueq <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
@@ -306,17 +456,29 @@ define void @fcmp_ueq_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ugt(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ugt'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ugt <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ugt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ugt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ugt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ugt'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ugt <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ugt <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ugt <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ugt'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ugt <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
@@ -331,11 +493,17 @@ define void @fcmp_ugt(i32 %arg) {
 }
 
 define void @fcmp_ugt_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ugt_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ugt_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ugt_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
@@ -344,17 +512,29 @@ define void @fcmp_ugt_bfloat(i32 %arg) {
 }
 
 define void @fcmp_uge(i32 %arg) {
-; CHECK-LABEL: 'fcmp_uge'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uge <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_uge'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp uge <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uge <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp uge <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_uge'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uge <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
   %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
@@ -369,11 +549,17 @@ define void @fcmp_uge(i32 %arg) {
 }
 
 define void @fcmp_uge_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_uge_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_uge_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_uge_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
@@ -382,17 +568,29 @@ define void @fcmp_uge_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ult(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ult'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ult <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ult <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ult <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ult <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ult'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ult <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ult <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ult <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ult'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ult <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
@@ -407,11 +605,17 @@ define void @fcmp_ult(i32 %arg) {
 }
 
 define void @fcmp_ult_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ult_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ult_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ult_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
@@ -420,17 +624,29 @@ define void @fcmp_ult_bfloat(i32 %arg) {
 }
 
 define void @fcmp_ule(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ule'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ule <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ule <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ule <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ule <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ule'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp ule <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ule <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp ule <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ule'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp ule <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
   %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
@@ -445,12 +661,19 @@ define void @fcmp_ule(i32 %arg) {
 }
 
 define void @fcmp_ule_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_ule_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp ule <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_ule_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp ule <vscale x 16 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_ule_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp ule <vscale x 16 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
@@ -460,17 +683,29 @@ define void @fcmp_ule_bfloat(i32 %arg) {
 }
 
 define void @fcmp_une(i32 %arg) {
-; CHECK-LABEL: 'fcmp_une'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp une <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp une <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp une <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp une <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp une <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp une <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp une <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp une <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp une <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_une'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp une <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp une <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp une <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp une <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp une <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp une <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp une <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp une <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp une <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_une'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp une <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp une <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp une <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp une <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp une <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp une <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp une <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp une <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp une <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp une <vscale x 2 x float> undef, undef
   %v4f32 = fcmp une <vscale x 4 x float> undef, undef
@@ -485,12 +720,19 @@ define void @fcmp_une(i32 %arg) {
 }
 
 define void @fcmp_une_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_une_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp une <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_une_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp une <vscale x 16 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_une_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp une <vscale x 16 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
@@ -500,17 +742,29 @@ define void @fcmp_une_bfloat(i32 %arg) {
 }
 
 define void @fcmp_uno(i32 %arg) {
-; CHECK-LABEL: 'fcmp_uno'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uno <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uno <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uno <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uno <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_uno'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp uno <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uno <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp uno <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uno <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uno <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uno <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp uno <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_uno'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp uno <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp uno <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp uno <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp uno <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
   %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
@@ -525,12 +779,19 @@ define void @fcmp_uno(i32 %arg) {
 }
 
 define void @fcmp_uno_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_uno_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp uno <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_uno_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp uno <vscale x 16 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_uno_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp uno <vscale x 16 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
@@ -540,17 +801,29 @@ define void @fcmp_uno_bfloat(i32 %arg) {
 }
 
 define void @fcmp_true(i32 %arg) {
-; CHECK-LABEL: 'fcmp_true'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp true <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp true <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp true <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp true <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp true <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp true <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp true <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp true <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp true <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_true'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp true <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp true <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp true <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp true <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp true <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp true <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp true <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp true <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp true <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_true'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp true <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp true <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp true <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp true <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp true <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp true <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp true <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp true <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp true <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp true <vscale x 2 x float> undef, undef
   %v4f32 = fcmp true <vscale x 4 x float> undef, undef
@@ -565,12 +838,19 @@ define void @fcmp_true(i32 %arg) {
 }
 
 define void @fcmp_true_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_true_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp true <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_true_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp true <vscale x 16 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_true_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp true <vscale x 16 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
@@ -580,17 +860,29 @@ define void @fcmp_true_bfloat(i32 %arg) {
 }
 
 define void @fcmp_false(i32 %arg) {
-; CHECK-LABEL: 'fcmp_false'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp false <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp false <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp false <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp false <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp false <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp false <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp false <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp false <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp false <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_false'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp false <vscale x 2 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp false <vscale x 4 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v8f32 = fcmp false <vscale x 8 x float> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp false <vscale x 2 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v4f64 = fcmp false <vscale x 4 x double> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp false <vscale x 2 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp false <vscale x 4 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp false <vscale x 8 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %v16f16 = fcmp false <vscale x 16 x half> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_false'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f32 = fcmp false <vscale x 2 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f32 = fcmp false <vscale x 4 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = fcmp false <vscale x 8 x float> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f64 = fcmp false <vscale x 2 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = fcmp false <vscale x 4 x double> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v2f16 = fcmp false <vscale x 2 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v4f16 = fcmp false <vscale x 4 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:1 for: %v8f16 = fcmp false <vscale x 8 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = fcmp false <vscale x 16 x half> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f32 = fcmp false <vscale x 2 x float> undef, undef
   %v4f32 = fcmp false <vscale x 4 x float> undef, undef
@@ -605,12 +897,19 @@ define void @fcmp_false(i32 %arg) {
 }
 
 define void @fcmp_false_bfloat(i32 %arg) {
-; CHECK-LABEL: 'fcmp_false_bfloat'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-COST2-LABEL: 'fcmp_false_bfloat'
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <vscale x 8 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
+; CHECK-COST2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-COST1-LABEL: 'fcmp_false_bfloat'
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:3 for: %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <vscale x 8 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:7 Lat:5 SizeLat:7 for: %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
+; CHECK-COST1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
   %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
@@ -618,4 +917,3 @@ define void @fcmp_false_bfloat(i32 %arg) {
   %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
   ret void
 }
-
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-vls-shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-vls-shuffle-extract.ll
new file mode 100644
index 000000000000..65261a8b2c6d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-vls-shuffle-extract.ll
@@ -0,0 +1,339 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=128 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-1
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-1
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=256 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-2
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-2
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=512 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-4
+; RUN: opt < %s -mtriple=aarch64 -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output \
+; RUN:    -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512 \
+; RUN:    | FileCheck %s --check-prefixes=VSCALE-ANY,VSCALE-4
+
+; This tests the cost of fixed-length subvector extracts for SVE,
+; either for a minimum vscale or a fixed vscale (aka VLS).
+; For the NEON equivalent test, see shuffle-extract.ll
+
+define void @extract_half_lo() {
+; VSCALE-ANY-LABEL: 'extract_half_lo'
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v32i8_lo = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v64i8_lo = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v32i16_lo = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> zeroinitializer
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v32i8_lo = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v64i8_lo = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v32i16_lo = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 0>
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ret void
+}
+
+define void @extract_half_hi() {
+; VSCALE-1-LABEL: 'extract_half_hi'
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-2-LABEL: 'extract_half_hi'
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-4-LABEL: 'extract_half_hi'
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ret void
+}
+
+define void @extract_half_unaligned() {
+; VSCALE-1-LABEL: 'extract_half_unaligned'
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-2-LABEL: 'extract_half_unaligned'
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-4-LABEL: 'extract_half_unaligned'
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
+
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+
+  ret void
+}
+
+define void @extract_qtr_lo() {
+; VSCALE-ANY-LABEL: 'extract_qtr_lo'
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v32i8_lo = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v64i8_lo = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v32i16_lo = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+; VSCALE-ANY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v32i8_lo = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v64i8_lo = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v32i16_lo = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 0>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 0>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+
+  ret void
+}
+
+define void @extract_qtr_hi() {
+; VSCALE-1-LABEL: 'extract_qtr_hi'
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-2-LABEL: 'extract_qtr_hi'
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-4-LABEL: 'extract_qtr_hi'
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v32i8_hi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v64i8_hi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v32i16_hi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+
+  ret void
+}
+
+define void @extract_qtr_unaligned() {
+; VSCALE-1-LABEL: 'extract_qtr_unaligned'
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-2-LABEL: 'extract_qtr_unaligned'
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VSCALE-4-LABEL: 'extract_qtr_unaligned'
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; VSCALE-4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v32i8_mi = shufflevector <32 x i8> poison, <32 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v64i8_mi = shufflevector <64 x i8> poison, <64 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v32i16_mi = shufflevector <32 x i16> poison, <32 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll
index cc8f2da57f07..254c191569f8 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=MVE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -31,55 +30,30 @@ declare <32 x i8>  @llvm.abs.v32i8(<32 x i8>, i1)
 declare <64 x i8>  @llvm.abs.v64i8(<64 x i8>, i1)
 
 define i32 @abs(i32 %arg) {
-; MVE-RECIP-LABEL: 'abs'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-SIZE-LABEL: 'abs'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; MVE-LABEL: 'abs'
+; MVE-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:6 Lat:5 SizeLat:5 for: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:55 Lat:74 SizeLat:74 for: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:148 CodeSize:110 Lat:148 SizeLat:148 for: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:296 CodeSize:220 Lat:296 SizeLat:296 for: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:29 Lat:40 SizeLat:40 for: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:29 Lat:40 SizeLat:40 for: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:29 Lat:40 SizeLat:40 for: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
   %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-active_lane_mask.ll b/llvm/test/Analysis/CostModel/ARM/mve-active_lane_mask.ll
index 664b828fa9fd..a12fd00589b6 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-active_lane_mask.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-active_lane_mask.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -9,8 +9,8 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @v4i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: 'v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
   ret void
@@ -18,8 +18,8 @@ define void @v4i32(i32 %index, i32 %TC) {
 
 define void @v8i16(i32 %index, i32 %TC) {
 ; CHECK-LABEL: 'v8i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
   ret void
@@ -27,8 +27,8 @@ define void @v8i16(i32 %index, i32 %TC) {
 
 define void @v16i8(i32 %index, i32 %TC) {
 ; CHECK-LABEL: 'v16i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
   ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
index fa1cf17b9174..4e6ebbc59396 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
@@ -1,30 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @icmp() {
 ; CHECK-LABEL: 'icmp'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2i8 = icmp slt <2 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = icmp slt <4 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = icmp slt <8 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = icmp slt <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = icmp slt <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = icmp slt <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = icmp slt <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v2i128 = icmp slt <2 x i128> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v4i128 = icmp slt <4 x i128> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %v2i8 = icmp slt <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = icmp slt <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = icmp slt <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = icmp slt <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:132 CodeSize:130 Lat:132 SizeLat:132 for: %v32i8 = icmp slt <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %v2i16 = icmp slt <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = icmp slt <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = icmp slt <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:68 CodeSize:66 Lat:68 SizeLat:68 for: %v16i16 = icmp slt <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %v2i32 = icmp slt <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = icmp slt <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:34 Lat:36 SizeLat:36 for: %v8i32 = icmp slt <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:68 Lat:72 SizeLat:72 for: %v16i32 = icmp slt <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:18 Lat:36 SizeLat:36 for: %v2i64 = icmp slt <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:36 Lat:72 SizeLat:72 for: %v4i64 = icmp slt <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:144 CodeSize:72 Lat:144 SizeLat:144 for: %v8i64 = icmp slt <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:68 CodeSize:34 Lat:68 SizeLat:68 for: %v2i128 = icmp slt <2 x i128> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:136 CodeSize:68 Lat:136 SizeLat:136 for: %v4i128 = icmp slt <4 x i128> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = icmp slt <2 x i8> undef, undef
   %v4i8 = icmp slt <4 x i8> undef, undef
@@ -54,32 +54,32 @@ define void @icmp() {
 
 define void @fcmp() {
 ; CHECK-MVE-LABEL: 'fcmp'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 12 for: %v2f16 = fcmp olt <2 x half> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 24 for: %v4f16 = fcmp olt <4 x half> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 48 for: %v8f16 = fcmp olt <8 x half> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 96 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 12 for: %v2f32 = fcmp olt <2 x float> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 24 for: %v4f32 = fcmp olt <4 x float> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 48 for: %v8f32 = fcmp olt <8 x float> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 96 for: %v16f32 = fcmp olt <16 x float> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 12 for: %v2f64 = fcmp olt <2 x double> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 24 for: %v4f64 = fcmp olt <4 x double> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 48 for: %v8f64 = fcmp olt <8 x double> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'fcmp'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = fcmp olt <2 x half> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = fcmp olt <4 x half> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = fcmp olt <8 x half> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:68 CodeSize:66 Lat:68 SizeLat:68 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = fcmp olt <2 x float> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = fcmp olt <4 x float> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:34 Lat:36 SizeLat:36 for: %v8f32 = fcmp olt <8 x float> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:68 Lat:72 SizeLat:72 for: %v16f32 = fcmp olt <16 x float> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = fcmp olt <2 x double> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = fcmp olt <4 x double> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f64 = fcmp olt <8 x double> undef, undef
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2f16 = fcmp olt <2 x half> undef, undef
   %v4f16 = fcmp olt <4 x half> undef, undef
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
index fa18f4724c39..5a23ebf0c2b8 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
@@ -1,37 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define i32 @masked_gather() {
 ; CHECK-LABEL: 'masked_gather'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x ptr> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; CHECK-NEXT:  Cost Model: Found costs of 16 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 96 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 32 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 24 for: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 12 for: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 34 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 192 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 80 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 160 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 36 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 320 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 144 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 36 for: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 12 for: %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x ptr> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef)
   %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef)
@@ -72,32 +72,32 @@ define i32 @masked_gather() {
 
 define i32 @masked_scatter() {
 ; CHECK-LABEL: 'masked_scatter'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; CHECK-NEXT:  Cost Model: Found costs of 16 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 96 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 24 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 12 for: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 34 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 192 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 160 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 320 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 144 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
   call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
@@ -136,29 +136,29 @@ define i32 @masked_scatter() {
 
 define void @gep_v4i32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, ptr %base, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, ptr %base, <4 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, ptr %base, <4 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep2 = getelementptr i32, ptr %base, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep3 = getelementptr i32, ptr %base, <4 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepu = getelementptr i32, ptr %base, <4 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32
   %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
@@ -195,29 +195,29 @@ define void @gep_v4i32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4
 
 define void @gep_v4f32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v4f32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, ptr %base, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, ptr %base, <4 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, ptr %base, <4 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep2 = getelementptr float, ptr %base, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep3 = getelementptr float, ptr %base, <4 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gepu = getelementptr float, ptr %base, <4 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 32 for: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32
   %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
@@ -254,27 +254,27 @@ define void @gep_v4f32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4
 
 define void @gep_v4i16(ptr %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> %gep1, i32 2, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, ptr %base, <4 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> %gep2, i32 2, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, ptr %base, <4 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> %gep3, i32 2, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i16, ptr %base, <4 x i16> %ind16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <4 x i16> %res5 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <4 x i16> %res6 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> %gep1, i32 2, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep2 = getelementptr i16, ptr %base, <4 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> %gep2, i32 2, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep3 = getelementptr i16, ptr %base, <4 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> %gep3, i32 2, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep5 = getelementptr i16, ptr %base, <4 x i16> %ind16
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res5zext = zext <4 x i16> %res5 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res6sext = sext <4 x i16> %res6 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32
   %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
@@ -308,16 +308,16 @@ define void @gep_v4i16(ptr %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %
 
 define void @gep_v4i8(ptr %base, <4 x i8> %ind8, <4 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v4i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res5zext = zext <4 x i8> %res5 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res6sext = sext <4 x i8> %res6 to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %res5zext = zext <4 x i8> %res5 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %res6sext = sext <4 x i8> %res6 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   ; result zext
   %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8
@@ -337,36 +337,36 @@ define void @gep_v4i8(ptr %base, <4 x i8> %ind8, <4 x i1> %mask)  {
 
 define void @gep_v8i16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v8i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, ptr %base, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, ptr %base, <8 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i16, ptr %base, <8 x i32> %indzext4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %indtrunc, <8 x ptr> %gep4, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ressext = sext <8 x i16> %res to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %restrunc = trunc <8 x i32> %ressext to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> %gep4, i32 4, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep2 = getelementptr i16, ptr %base, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep3 = getelementptr i16, ptr %base, <8 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep4 = getelementptr i16, ptr %base, <8 x i32> %indzext4
+; CHECK-NEXT:  Cost Model: Found costs of 16 for: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %indtrunc, <8 x ptr> %gep4, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %ressext = sext <8 x i16> %res to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 16 for: %restrunc = trunc <8 x i32> %ressext to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> %gep4, i32 4, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   ; no offset ext
   %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32
@@ -418,28 +418,28 @@ define void @gep_v8i16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8
 
 define void @gep_v8f16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v8f16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, ptr %base, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, ptr %base, <8 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep2 = getelementptr half, ptr %base, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep3 = getelementptr half, ptr %base, <8 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   ; no offset ext
   %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32
@@ -479,17 +479,17 @@ define void @gep_v8f16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8
 
 define void @gep_v8i8(ptr %base, <8 x i8> %ind8, <8 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v8i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %indzext = zext <8 x i8> %ind8 to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, ptr %base, <8 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <8 x i8> %res5 to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <8 x i8> %res6 to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i8> %ind8 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep5 = getelementptr i8, ptr %base, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res5zext = zext <8 x i8> %res5 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res6sext = sext <8 x i8> %res6 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   ; result zext
   %indzext = zext <8 x i8> %ind8 to <8 x i32>
@@ -510,26 +510,26 @@ define void @gep_v8i8(ptr %base, <8 x i8> %ind8, <8 x i1> %mask)  {
 
 define void @gep_v16i8(ptr %base, ptr %base16, <16 x i8> %ind8, <16 x i32> %ind32, <16 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v16i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> %gep1, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, ptr %base, <16 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> %gep2, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, ptr %base, <16 x i32> %indsext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> %gep3, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <16 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x ptr> %gepbs to <16 x ptr>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbsb, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep4 = getelementptr i8, ptr %base, <16 x i32> %indzext
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> %gep4, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> %gep1, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <16 x i8> %ind8 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep2 = getelementptr i8, ptr %base, <16 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> %gep2, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <16 x i8> %ind8 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gep3 = getelementptr i8, ptr %base, <16 x i32> %indsext
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> %gep3, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <16 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbsb = bitcast <16 x ptr> %gepbs to <16 x ptr>
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbsb, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %gep4 = getelementptr i8, ptr %base, <16 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found costs of 32 for: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> %gep4, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   ; no offset ext
   %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32
@@ -565,10 +565,10 @@ define void @gep_v16i8(ptr %base, ptr %base16, <16 x i8> %ind8, <16 x i32> %ind3
 
 define void @gep_v16i8p(<16 x ptr> %base, i32 %off, <16 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v16i8p'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbs, i32 2, <16 x i1> %mask)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbs, i32 2, <16 x i1> %mask)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off
   %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
index 01341e4dcb64..e4cc8fed5052 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=MVE-RECIP,MVEI-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=MVE-SIZE,MVEI-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE-RECIP,MVEF-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE-SIZE,MVEF-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=MVE,MVEI
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE,MVEF
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -33,55 +31,30 @@ declare <32 x i8>  @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.smin.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @smin(i32 %arg) {
-; MVE-RECIP-LABEL: 'smin'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-SIZE-LABEL: 'smin'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; MVE-LABEL: 'smin'
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:19 Lat:38 SizeLat:38 for: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:38 Lat:76 SizeLat:76 for: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:76 Lat:152 SizeLat:152 for: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -140,55 +113,30 @@ declare <32 x i8>  @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @smax(i32 %arg) {
-; MVE-RECIP-LABEL: 'smax'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-SIZE-LABEL: 'smax'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; MVE-LABEL: 'smax'
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:19 Lat:38 SizeLat:38 for: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:38 Lat:76 SizeLat:76 for: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:76 Lat:152 SizeLat:152 for: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -248,55 +196,30 @@ declare <32 x i8>  @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @umin(i32 %arg) {
-; MVE-RECIP-LABEL: 'umin'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-SIZE-LABEL: 'umin'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; MVE-LABEL: 'umin'
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:19 Lat:38 SizeLat:38 for: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:38 Lat:76 SizeLat:76 for: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:76 Lat:152 SizeLat:152 for: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -354,56 +277,31 @@ declare <16 x i8>  @llvm.umax.v16i8(<16 x i8>, <16 x i8>)
 declare <32 x i8>  @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
 
-define i32 @sub(i32 %arg) {
-; MVE-RECIP-LABEL: 'sub'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-SIZE-LABEL: 'sub'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+define i32 @umax(i32 %arg) {
+; MVE-LABEL: 'umax'
+; MVE-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:3 SizeLat:3 for: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:19 Lat:38 SizeLat:38 for: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:76 CodeSize:38 Lat:76 SizeLat:76 for: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:76 Lat:152 SizeLat:152 for: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:11 Lat:22 SizeLat:22 for: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -455,77 +353,41 @@ declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
 declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
 
 define float @minnum(float %arg) {
-; MVEI-RECIP-LABEL: 'minnum'
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float undef
+; MVEI-LABEL: 'minnum'
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F64 = call double @llvm.minnum.f64(double undef, double undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F32 = call float @llvm.minnum.f32(float undef, float undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:176 CodeSize:32 Lat:176 SizeLat:176 for: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F16 = call half @llvm.minnum.f16(half undef, half undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:176 CodeSize:32 Lat:176 SizeLat:176 for: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:352 CodeSize:64 Lat:352 SizeLat:352 for: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float undef
 ;
-; MVEI-SIZE-LABEL: 'minnum'
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float undef
-;
-; MVEF-RECIP-LABEL: 'minnum'
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float undef
-;
-; MVEF-SIZE-LABEL: 'minnum'
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+; MVEF-LABEL: 'minnum'
+; MVEF-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F64 = call double @llvm.minnum.f64(double undef, double undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of 1 for: %F32 = call float @llvm.minnum.f32(float undef, float undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of 1 for: %F16 = call half @llvm.minnum.f16(half undef, half undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float undef
 ;
   %F64 = call double @llvm.minnum.f64(double undef, double undef)
   %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
@@ -567,77 +429,41 @@ declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
 declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
 
 define float @maxnum(float %arg) {
-; MVEI-RECIP-LABEL: 'maxnum'
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEI-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float undef
-;
-; MVEI-SIZE-LABEL: 'maxnum'
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEI-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float undef
-;
-; MVEF-RECIP-LABEL: 'maxnum'
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEF-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float undef
+; MVEI-LABEL: 'maxnum'
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:176 CodeSize:32 Lat:176 SizeLat:176 for: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:176 CodeSize:32 Lat:176 SizeLat:176 for: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:352 CodeSize:64 Lat:352 SizeLat:352 for: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float undef
 ;
-; MVEF-SIZE-LABEL: 'maxnum'
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
-; MVEF-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+; MVEF-LABEL: 'maxnum'
+; MVEF-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:4 Lat:22 SizeLat:22 for: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:44 CodeSize:8 Lat:44 SizeLat:44 for: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:88 CodeSize:16 Lat:88 SizeLat:88 for: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-NEXT:  Cost Model: Found costs of 1 for: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEF-NEXT:  Cost Model: Found costs of 1 for: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEF-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret float undef
 ;
   %F64 = call double @llvm.maxnum.f64(double undef, double undef)
   %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
index ef0b28ea2604..bd854611080d 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
@@ -1,60 +1,60 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
 
 define void @vld2(ptr %p) {
 ; CHECK-LABEL: 'vld2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v4i8 = load <4 x i8>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v8i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v16i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v32i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v4i16 = load <4 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v8i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v16i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v32i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v4i32 = load <4 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v8i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v16i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v32i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v2i64 = load <4 x i64>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v4i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v8i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v16i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1536 CodeSize:768 Lat:1536 SizeLat:1536 for: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1536 CodeSize:768 Lat:1536 SizeLat:1536 for: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8 = load <4 x i8>, ptr %p
   %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
@@ -114,71 +114,71 @@ define void @vld2(ptr %p) {
 
 define void @vld3(ptr %p) {
 ; CHECK-LABEL: 'vld3'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:4 SizeLat:1 for: %v2i8 = load <6 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:1 Lat:4 SizeLat:1 for: %v4i8 = load <12 x i8>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v8i8 = load <24 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v16i8 = load <48 x i8>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:4 SizeLat:1 for: %v2i16 = load <6 x i16>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v4i16 = load <12 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v8i16 = load <24 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v16i16 = load <48 x i16>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v2i32 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v4i32 = load <12 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v8i32 = load <24 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v16i32 = load <48 x i32>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v2i64 = load <6 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v4i64 = load <12 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v8i64 = load <24 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:4 SizeLat:1 for: %v16i64 = load <48 x i64>, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2048 CodeSize:1024 Lat:2048 SizeLat:2048 for: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2048 CodeSize:1024 Lat:2048 SizeLat:2048 for: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2048 CodeSize:1024 Lat:2048 SizeLat:2048 for: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = load <6 x i8>, ptr %p
   %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
@@ -253,170 +253,170 @@ define void @vld3(ptr %p) {
 
 define void @vld4(ptr %p) {
 ; CHECK-UF2-LABEL: 'vld4'
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v4i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v2i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:80 Lat:160 SizeLat:160 for: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v8i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:4 SizeLat:1 for: %v16i64 = load <64 x i64>, ptr %p, align 512
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-UF4-LABEL: 'vld4'
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v4i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:258 CodeSize:129 Lat:258 SizeLat:258 for: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %v2i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:65 Lat:130 SizeLat:130 for: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:260 CodeSize:130 Lat:260 SizeLat:260 for: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:17 Lat:34 SizeLat:34 for: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:33 Lat:66 SizeLat:66 for: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:132 CodeSize:66 Lat:132 SizeLat:132 for: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:132 CodeSize:66 Lat:132 SizeLat:132 for: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:132 CodeSize:66 Lat:132 SizeLat:132 for: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:132 CodeSize:66 Lat:132 SizeLat:132 for: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:132 Lat:264 SizeLat:264 for: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:132 Lat:264 SizeLat:264 for: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:132 Lat:264 SizeLat:264 for: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:132 Lat:264 SizeLat:264 for: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:160 Lat:320 SizeLat:320 for: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:640 CodeSize:320 Lat:640 SizeLat:640 for: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:4 SizeLat:1 for: %v8i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:1280 CodeSize:640 Lat:1280 SizeLat:1280 for: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:4 SizeLat:1 for: %v16i64 = load <64 x i64>, ptr %p, align 512
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2560 CodeSize:1280 Lat:2560 SizeLat:2560 for: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = load <8 x i8>, ptr %p
   %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
@@ -507,39 +507,39 @@ define void @vld4(ptr %p) {
 
 define void @vst2(ptr %p) {
 ; CHECK-LABEL: 'vst2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4i8, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i16> %v4i16, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> %v4i32, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i64> %v4i64, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i8> %v4i8, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i16> %v4i16, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> %v4i32, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i64> %v4i64, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   store <4 x i8> %v4i8, ptr %p
@@ -583,39 +583,39 @@ define void @vst2(ptr %p) {
 
 define void @vst3(ptr %p) {
 ; CHECK-LABEL: 'vst3'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: store <6 x i8> %v8i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: store <12 x i8> %v16i8, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <24 x i8> %v32i8, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <48 x i8> %v64i8, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: store <6 x i16> %v8i16, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <12 x i16> %v16i16, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <24 x i16> %v32i16, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <48 x i16> %v64i16, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <6 x i32> %v8i32, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <12 x i32> %v16i32, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <24 x i32> %v32i32, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <48 x i32> %v64i32, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <6 x i64> %v8i64, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <12 x i64> %v16i64, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <24 x i64> %v32i64, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <48 x i64> %v64i64, ptr %p, align 512
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:96 CodeSize:48 Lat:96 SizeLat:96 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i8> %v8i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:1 Lat:1 SizeLat:1 for: store <12 x i8> %v16i8, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <24 x i8> %v32i8, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <48 x i8> %v64i8, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:96 CodeSize:48 Lat:96 SizeLat:96 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i16> %v8i16, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <12 x i16> %v16i16, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <24 x i16> %v32i16, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <48 x i16> %v64i16, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:96 CodeSize:48 Lat:96 SizeLat:96 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i32> %v8i32, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <12 x i32> %v16i32, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <24 x i32> %v32i32, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <48 x i32> %v64i32, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i64> %v8i64, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found costs of RThru:384 CodeSize:192 Lat:384 SizeLat:384 for: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <12 x i64> %v16i64, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found costs of RThru:768 CodeSize:384 Lat:768 SizeLat:768 for: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <24 x i64> %v32i64, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1536 CodeSize:768 Lat:1536 SizeLat:1536 for: %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:1 SizeLat:1 for: store <48 x i64> %v64i64, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
   store <6 x i8> %v8i8, ptr %p
@@ -659,74 +659,74 @@ define void @vst3(ptr %p) {
 
 define void @vst4(ptr %p) {
 ; CHECK-UF2-LABEL: 'vst4'
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
-; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i8> %v64i8, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i16> %v64i16, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i32> %v64i32, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:2048 CodeSize:1024 Lat:2048 SizeLat:2048 for: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i64> %v64i64, ptr %p, align 512
+; CHECK-UF2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-UF4-LABEL: 'vst4'
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
-; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i8> %v64i8, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i16> %v64i16, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i32> %v64i32, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:512 CodeSize:256 Lat:512 SizeLat:512 for: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:1024 CodeSize:512 Lat:1024 SizeLat:1024 for: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:2048 CodeSize:1024 Lat:2048 SizeLat:2048 for: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:1 Lat:1 SizeLat:1 for: store <64 x i64> %v64i64, ptr %p, align 512
+; CHECK-UF4-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
   store <8 x i8> %v8i8, ptr %p
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
index 8edc2e6fc156..5f1bce902b93 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @add_i8() {
 ; CHECK-LABEL: 'add_i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
 
@@ -27,32 +27,32 @@ define void @add_i8() {
 
 define void @add_i16() {
 ; CHECK-LABEL: 'add_i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1za = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sa = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2za = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2sa = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3za = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3sa = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i16>
   %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
@@ -99,52 +99,52 @@ define void @add_i16() {
 
 define void @add_i32() {
 ; CHECK-LABEL: 'add_i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1za = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sa = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2za = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2sa = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3za = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3sa = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5za = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5sa = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a6za = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a6sa = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7za = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7sa = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8za = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8sa = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9za = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9sa = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i32>
   %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
@@ -221,72 +221,72 @@ define void @add_i32() {
 
 define void @add_i64() {
 ; CHECK-LABEL: 'add_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a1za = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a1sa = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a2za = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a2sa = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a3za = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a3sa = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:298 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1322 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a5za = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %a5sa = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a6za = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a6sa = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a7za = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a7sa = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a8za = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a8sa = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:296 CodeSize:1 Lat:1 SizeLat:1 for: %a9za = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1320 CodeSize:1 Lat:1 SizeLat:1 for: %a9sa = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10za = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10sa = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a11za = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %a11sa = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a12za = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:1 Lat:1 SizeLat:1 for: %a12sa = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %a13za = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:1 Lat:1 SizeLat:1 for: %a13sa = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:288 CodeSize:1 Lat:1 SizeLat:1 for: %a14za = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1056 CodeSize:1 Lat:1 SizeLat:1 for: %a14sa = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i64>
   %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
@@ -393,17 +393,17 @@ define void @add_i64() {
 
 define void @mla_i8() {
 ; CHECK-LABEL: 'mla_i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0m = mul <1 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a1m = mul <2 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4m = mul <16 x i8> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a0m = mul <1 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
+; CHECK-NEXT:  Cost Model: Found costs of 10 for: %a1m = mul <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2m = mul <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3m = mul <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a4m = mul <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0m = mul <1 x i8> undef, undef
   %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
@@ -425,57 +425,57 @@ define void @mla_i8() {
 
 define void @mla_i16() {
 ; CHECK-LABEL: 'mla_i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i16> %a0za, %a0zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i16> %a0sa, %a0sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sb = sext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i16> %a2sa, %a2sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zb = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zm = mul <8 x i16> %a3za, %a3zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sb = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sm = mul <8 x i16> %a3sa, %a3sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4zb = zext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4zm = mul <16 x i16> %a4za, %a4zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sb = sext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a6m = mul <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8m = mul <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9m = mul <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0zb = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a0zm = mul <1 x i16> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sb = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a0sm = mul <1 x i16> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1za = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1zb = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a1zm = mul <2 x i16> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sa = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sb = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a1sm = mul <2 x i16> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2za = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2zb = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2zm = mul <4 x i16> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2sa = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a2sb = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2sm = mul <4 x i16> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3za = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3zb = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3zm = mul <8 x i16> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3sa = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3sb = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3sm = mul <8 x i16> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4zb = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4zm = mul <16 x i16> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a4sb = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4sm = mul <16 x i16> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a5m = mul <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
+; CHECK-NEXT:  Cost Model: Found costs of 10 for: %a6m = mul <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7m = mul <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a8m = mul <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a9m = mul <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i16>
   %a0zb = zext <1 x i8> undef to <1 x i16>
@@ -547,97 +547,97 @@ define void @mla_i16() {
 
 define void @mla_i32() {
 ; CHECK-LABEL: 'mla_i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i32> %a0za, %a0zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i32> %a0sa, %a0sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sb = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i32> %a2sa, %a2sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3zb = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3zm = mul <8 x i32> %a3za, %a3zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sb = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3sm = mul <8 x i32> %a3sa, %a3sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4zb = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4zm = mul <16 x i32> %a4za, %a4zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sb = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4sm = mul <16 x i32> %a4sa, %a4sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sb = sext <1 x i16> undef to <1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sm = mul <1 x i32> %a5sa, %a5sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6zb = zext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sb = sext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sb = sext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sm = mul <4 x i32> %a7sa, %a7sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8zb = zext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8zm = mul <8 x i32> %a8za, %a8zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sb = sext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8sm = mul <8 x i32> %a8sa, %a8sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9zb = zext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9zm = mul <16 x i32> %a9za, %a9zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sb = sext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a11m = mul <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14m = mul <16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0zb = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a0zm = mul <1 x i32> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a0sb = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a0sm = mul <1 x i32> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1za = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a1zb = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a1zm = mul <2 x i32> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sa = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a1sb = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a1sm = mul <2 x i32> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2za = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2zb = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2zm = mul <4 x i32> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2sa = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a2sb = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2sm = mul <4 x i32> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3za = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3zb = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3zm = mul <8 x i32> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3sa = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a3sb = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3sm = mul <8 x i32> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4zb = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4zm = mul <16 x i32> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a4sb = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4sm = mul <16 x i32> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5za = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5zb = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a5zm = mul <1 x i32> %a5za, %a5zb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5sa = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a5sb = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a5sm = mul <1 x i32> %a5sa, %a5sb
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a6za = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a6zb = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a6zm = mul <2 x i32> %a6za, %a6zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a6sa = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %a6sb = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of 26 for: %a6sm = mul <2 x i32> %a6sa, %a6sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7za = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7zb = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7zm = mul <4 x i32> %a7za, %a7zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7sa = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7sb = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7sm = mul <4 x i32> %a7sa, %a7sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8za = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8zb = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8zm = mul <8 x i32> %a8za, %a8zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8sa = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %a8sb = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8sm = mul <8 x i32> %a8sa, %a8sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9za = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9zb = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9zm = mul <16 x i32> %a9za, %a9zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9sa = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %a9sb = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9sm = mul <16 x i32> %a9sa, %a9sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a10m = mul <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
+; CHECK-NEXT:  Cost Model: Found costs of 10 for: %a11m = mul <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:38 Lat:54 SizeLat:54 for: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a12m = mul <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a13m = mul <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a14m = mul <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i32>
   %a0zb = zext <1 x i8> undef to <1 x i32>
@@ -759,137 +759,137 @@ define void @mla_i32() {
 
 define void @mla_i64() {
 ; CHECK-LABEL: 'mla_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0zm = mul <1 x i64> %a0za, %a0zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0sm = mul <1 x i64> %a0sa, %a0sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1zb = zext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sb = sext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2zb = zext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sb = sext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3zb = zext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3zm = mul <8 x i64> %a3za, %a3zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sb = sext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3sm = mul <8 x i64> %a3sa, %a3sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4zb = zext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sb = sext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5zm = mul <1 x i64> %a5za, %a5zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5sm = mul <1 x i64> %a5sa, %a5sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6zb = zext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sb = sext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7zb = zext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sb = sext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8zb = zext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8zm = mul <8 x i64> %a8za, %a8zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sb = sext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8sm = mul <8 x i64> %a8sa, %a8sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9zb = zext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sb = sext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10zm = mul <1 x i64> %a10za, %a10zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10sm = mul <1 x i64> %a10sa, %a10sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11zb = zext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sb = sext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12zb = zext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sb = sext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13zb = zext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13zm = mul <8 x i64> %a13za, %a13zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sb = sext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13sm = mul <8 x i64> %a13sa, %a13sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14zb = zext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sb = sext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a15m = mul <1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a16m = mul <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a17m = mul <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %a18m = mul <8 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %a19m = mul <16 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0za = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0zb = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a0zm = mul <1 x i64> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0sa = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a0sb = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a0sm = mul <1 x i64> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a1za = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a1zb = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a1zm = mul <2 x i64> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a1sa = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a1sb = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a1sm = mul <2 x i64> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a2za = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a2zb = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a2zm = mul <4 x i64> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a2sa = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a2sb = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a2sm = mul <4 x i64> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a3za = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a3zb = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a3zm = mul <8 x i64> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a3sa = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a3sb = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a3sm = mul <8 x i64> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:298 CodeSize:1 Lat:1 SizeLat:1 for: %a4za = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:298 CodeSize:1 Lat:1 SizeLat:1 for: %a4zb = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a4zm = mul <16 x i64> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1322 CodeSize:1 Lat:1 SizeLat:1 for: %a4sa = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1322 CodeSize:1 Lat:1 SizeLat:1 for: %a4sb = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a4sm = mul <16 x i64> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a5za = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a5zb = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a5zm = mul <1 x i64> %a5za, %a5zb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %a5sa = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %a5sb = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a5sm = mul <1 x i64> %a5sa, %a5sb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a6za = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a6zb = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a6zm = mul <2 x i64> %a6za, %a6zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a6sa = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:10 Lat:20 SizeLat:20 for: %a6sb = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a6sm = mul <2 x i64> %a6sa, %a6sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a7za = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a7zb = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a7zm = mul <4 x i64> %a7za, %a7zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a7sa = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %a7sb = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a7sm = mul <4 x i64> %a7sa, %a7sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a8za = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %a8zb = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a8zm = mul <8 x i64> %a8za, %a8zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a8sa = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:330 CodeSize:1 Lat:1 SizeLat:1 for: %a8sb = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a8sm = mul <8 x i64> %a8sa, %a8sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:296 CodeSize:1 Lat:1 SizeLat:1 for: %a9za = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:296 CodeSize:1 Lat:1 SizeLat:1 for: %a9zb = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a9zm = mul <16 x i64> %a9za, %a9zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1320 CodeSize:1 Lat:1 SizeLat:1 for: %a9sa = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1320 CodeSize:1 Lat:1 SizeLat:1 for: %a9sb = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a9sm = mul <16 x i64> %a9sa, %a9sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10za = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10zb = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a10zm = mul <1 x i64> %a10za, %a10zb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10sa = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:1 Lat:1 SizeLat:1 for: %a10sb = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a10sm = mul <1 x i64> %a10sa, %a10sb
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a11za = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a11zb = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a11zm = mul <2 x i64> %a11za, %a11zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %a11sa = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %a11sb = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 52 for: %a11sm = mul <2 x i64> %a11sa, %a11sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a12za = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %a12zb = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a12zm = mul <4 x i64> %a12za, %a12zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:1 Lat:1 SizeLat:1 for: %a12sa = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:66 CodeSize:1 Lat:1 SizeLat:1 for: %a12sb = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 104 for: %a12sm = mul <4 x i64> %a12sa, %a12sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %a13za = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %a13zb = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a13zm = mul <8 x i64> %a13za, %a13zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:1 Lat:1 SizeLat:1 for: %a13sa = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:264 CodeSize:1 Lat:1 SizeLat:1 for: %a13sb = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 208 for: %a13sm = mul <8 x i64> %a13sa, %a13sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:288 CodeSize:1 Lat:1 SizeLat:1 for: %a14za = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:288 CodeSize:1 Lat:1 SizeLat:1 for: %a14zb = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a14zm = mul <16 x i64> %a14za, %a14zb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1056 CodeSize:1 Lat:1 SizeLat:1 for: %a14sa = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1056 CodeSize:1 Lat:1 SizeLat:1 for: %a14sb = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found costs of 416 for: %a14sm = mul <16 x i64> %a14sa, %a14sb
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %a15m = mul <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of 8 for: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
+; CHECK-NEXT:  Cost Model: Found costs of 20 for: %a16m = mul <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:76 Lat:108 SizeLat:108 for: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
+; CHECK-NEXT:  Cost Model: Found costs of 40 for: %a17m = mul <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:208 CodeSize:144 Lat:208 SizeLat:208 for: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
+; CHECK-NEXT:  Cost Model: Found costs of 80 for: %a18m = mul <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:408 CodeSize:280 Lat:408 SizeLat:408 for: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
+; CHECK-NEXT:  Cost Model: Found costs of 160 for: %a19m = mul <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:808 CodeSize:552 Lat:808 SizeLat:808 for: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i64>
   %a0zb = zext <1 x i8> undef to <1 x i64>
diff --git a/llvm/test/Analysis/CostModel/X86/mul64.ll b/llvm/test/Analysis/CostModel/X86/mul64.ll
index 9e4794760404..9af20febeaff 100644
--- a/llvm/test/Analysis/CostModel/X86/mul64.ll
+++ b/llvm/test/Analysis/CostModel/X86/mul64.ll
@@ -327,11 +327,11 @@ define void @mul_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:1 SizeLat:1 for: %xa64 = zext <64 x i8> %a64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:50 CodeSize:1 Lat:1 SizeLat:1 for: %xb64 = zext <64 x i8> %b64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %res2 = mul <2 x i64> %xa2, %xb2
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res4 = mul <4 x i64> %xa4, %xb4
-; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %res8 = mul <8 x i64> %xa8, %xb8
-; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %res16 = mul <16 x i64> %xa16, %xb16
-; AVX1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %res32 = mul <32 x i64> %xa32, %xb32
-; AVX1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: %res64 = mul <64 x i64> %xa64, %xb64
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:6 for: %res4 = mul <4 x i64> %xa4, %xb4
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:10 Lat:10 SizeLat:12 for: %res8 = mul <8 x i64> %xa8, %xb8
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:20 Lat:20 SizeLat:24 for: %res16 = mul <16 x i64> %xa16, %xb16
+; AVX1-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:40 Lat:40 SizeLat:48 for: %res32 = mul <32 x i64> %xa32, %xb32
+; AVX1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:80 Lat:80 SizeLat:96 for: %res64 = mul <64 x i64> %xa64, %xb64
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'mul_zext_vXi8'
@@ -985,11 +985,11 @@ define void @mul_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:52 CodeSize:1 Lat:1 SizeLat:1 for: %xa64 = zext <64 x i16> %a64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:52 CodeSize:1 Lat:1 SizeLat:1 for: %xb64 = zext <64 x i16> %b64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %res2 = mul <2 x i64> %xa2, %xb2
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res4 = mul <4 x i64> %xa4, %xb4
-; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %res8 = mul <8 x i64> %xa8, %xb8
-; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %res16 = mul <16 x i64> %xa16, %xb16
-; AVX1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %res32 = mul <32 x i64> %xa32, %xb32
-; AVX1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: %res64 = mul <64 x i64> %xa64, %xb64
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:6 for: %res4 = mul <4 x i64> %xa4, %xb4
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:10 Lat:10 SizeLat:12 for: %res8 = mul <8 x i64> %xa8, %xb8
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:20 Lat:20 SizeLat:24 for: %res16 = mul <16 x i64> %xa16, %xb16
+; AVX1-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:40 Lat:40 SizeLat:48 for: %res32 = mul <32 x i64> %xa32, %xb32
+; AVX1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:80 Lat:80 SizeLat:96 for: %res64 = mul <64 x i64> %xa64, %xb64
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'mul_zext_vXi16'
@@ -1643,11 +1643,11 @@ define void @mul_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %xa64 = zext <64 x i32> %a64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %xb64 = zext <64 x i32> %b64 to <64 x i64>
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:5 SizeLat:1 for: %res2 = mul <2 x i64> %xa2, %xb2
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res4 = mul <4 x i64> %xa4, %xb4
-; AVX1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %res8 = mul <8 x i64> %xa8, %xb8
-; AVX1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %res16 = mul <16 x i64> %xa16, %xb16
-; AVX1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %res32 = mul <32 x i64> %xa32, %xb32
-; AVX1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:1 Lat:1 SizeLat:1 for: %res64 = mul <64 x i64> %xa64, %xb64
+; AVX1-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:6 for: %res4 = mul <4 x i64> %xa4, %xb4
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:10 Lat:10 SizeLat:12 for: %res8 = mul <8 x i64> %xa8, %xb8
+; AVX1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:20 Lat:20 SizeLat:24 for: %res16 = mul <16 x i64> %xa16, %xb16
+; AVX1-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:40 Lat:40 SizeLat:48 for: %res32 = mul <32 x i64> %xa32, %xb32
+; AVX1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:80 Lat:80 SizeLat:96 for: %res64 = mul <64 x i64> %xa64, %xb64
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; AVX2-LABEL: 'mul_zext_vXi32'
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
index d08b68d3768a..aeeb21ebb320 100644
--- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
+++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
@@ -15,7 +15,7 @@
 define void @test_typedbuffer() {
   ; ByteAddressBuffer Buf : register(t8, space1)
   %srv0 = call target("dx.RawBuffer", void, 0, 0)
-      @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, i1 false, ptr @Zero.str)
+      @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr @Zero.str)
   ; CHECK: Resource [[SRV0:[0-9]+]]:
   ; CHECK:   Name: Zero
   ; CHECK:   Binding:
@@ -29,7 +29,7 @@ define void @test_typedbuffer() {
   ; struct S { float4 a; uint4 b; };
   ; StructuredBuffer<S> Buf : register(t2, space4)
   %srv1 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0)
-      @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 1, i32 0, i1 false, ptr @One.str)
+      @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 1, i32 0, ptr @One.str)
   ; CHECK: Resource [[SRV1:[0-9]+]]:
   ; CHECK:   Name: One
   ; CHECK:   Binding:
@@ -44,7 +44,7 @@ define void @test_typedbuffer() {
 
   ; Buffer<uint4> Buf[24] : register(t3, space5)
   %srv2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0)
-      @llvm.dx.resource.handlefrombinding(i32 5, i32 3, i32 24, i32 0, i1 false, ptr @Two.str)
+      @llvm.dx.resource.handlefrombinding(i32 5, i32 3, i32 24, i32 0, ptr @Two.str)
   ; CHECK: Resource [[SRV2:[0-9]+]]:
   ; CHECK:   Name: Two
   ; CHECK:   Binding:
@@ -59,7 +59,7 @@ define void @test_typedbuffer() {
 
   ; RWBuffer<int> Buf : register(u7, space2)
   %uav0 = call target("dx.TypedBuffer", i32, 1, 0, 1)
-      @llvm.dx.resource.handlefrombinding(i32 2, i32 7, i32 1, i32 0, i1 false, ptr @Three.str)
+      @llvm.dx.resource.handlefrombinding(i32 2, i32 7, i32 1, i32 0, ptr @Three.str)
   ; CHECK: Resource [[UAV0:[0-9]+]]:
   ; CHECK:   Name: Three
   ; CHECK:   Binding:
@@ -77,7 +77,7 @@ define void @test_typedbuffer() {
 
   ; RWBuffer<float4> Buf : register(u5, space3)
   %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
-      @llvm.dx.resource.handlefrombinding(i32 3, i32 5, i32 1, i32 0, i1 false, ptr @Four.str)
+      @llvm.dx.resource.handlefrombinding(i32 3, i32 5, i32 1, i32 0, ptr @Four.str)
   call i32 @llvm.dx.resource.updatecounter(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i8 -1)
   ; CHECK: Resource [[UAV1:[0-9]+]]:
   ; CHECK:   Name: Four
@@ -97,10 +97,10 @@ define void @test_typedbuffer() {
   ; RWBuffer<float4> BufferArray[10] : register(u0, space4)
   ; RWBuffer<float4> Buf = BufferArray[0]
   %uav2_1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
-        @llvm.dx.resource.handlefrombinding(i32 4, i32 0, i32 10, i32 0, i1 false, ptr @Array.str)
+        @llvm.dx.resource.handlefrombinding(i32 4, i32 0, i32 10, i32 0, ptr @Array.str)
   ; RWBuffer<float4> Buf = BufferArray[5]
   %uav2_2 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
-        @llvm.dx.resource.handlefrombinding(i32 4, i32 0, i32 10, i32 5, i1 false, ptr @Array.str)
+        @llvm.dx.resource.handlefrombinding(i32 4, i32 0, i32 10, i32 5, ptr @Array.str)
   call i32 @llvm.dx.resource.updatecounter(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav2_2, i8 1)
   ; CHECK: Resource [[UAV2:[0-9]+]]:
   ; CHECK:   Name: Array
@@ -119,7 +119,7 @@ define void @test_typedbuffer() {
 
   ; RWBuffer<float4> Buf : register(u0, space5)
   %uav3 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
-      @llvm.dx.resource.handlefrombinding(i32 5, i32 0, i32 1, i32 0, i1 false, ptr @Five.str)
+      @llvm.dx.resource.handlefrombinding(i32 5, i32 0, i32 1, i32 0, ptr @Five.str)
   call i32 @llvm.dx.resource.updatecounter(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav3, i8 -1)
   call i32 @llvm.dx.resource.updatecounter(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav3, i8 1)
   ; CHECK: Resource [[UAV3:[0-9]+]]:
@@ -138,7 +138,7 @@ define void @test_typedbuffer() {
   ; CHECK:   Element Count: 4
 
   %cb0 = call target("dx.CBuffer", {float})
-     @llvm.dx.resource.handlefrombinding(i32 1, i32 0, i32 1, i32 0, i1 false, ptr @CB.str)
+     @llvm.dx.resource.handlefrombinding(i32 1, i32 0, i32 1, i32 0, ptr @CB.str)
   ; CHECK: Resource [[CB0:[0-9]+]]:
   ; CHECK:   Name: CB
   ; CHECK:   Binding:
@@ -151,7 +151,7 @@ define void @test_typedbuffer() {
   ; CHECK:   CBuffer size: 4
 
   %cb1 = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
-     @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, i1 false, ptr @Constants.str)
+     @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr @Constants.str)
   ; CHECK: Resource [[CB1:[0-9]+]]:
   ; CHECK:   Name: Constants
   ; CHECK:   Binding:
diff --git a/llvm/test/Analysis/Delinearization/a.ll b/llvm/test/Analysis/Delinearization/a.ll
index 755c9baef9b8..1830a3da7785 100644
--- a/llvm/test/Analysis/Delinearization/a.ll
+++ b/llvm/test/Analysis/Delinearization/a.ll
@@ -11,7 +11,6 @@
 define void @foo(i64 %n, i64 %m, i64 %o, ptr nocapture %A) #0 {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %arrayidx11.us.us, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(28 + (4 * (-4 + (3 * %m)) * %o)),+,(8 * %m * %o)}<%for.i>,+,(12 * %o)}<%for.j>,+,20}<%for.k>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m][%o] with elements of 4 bytes.
diff --git a/llvm/test/Analysis/Delinearization/byte_offset.ll b/llvm/test/Analysis/Delinearization/byte_offset.ll
index 90b1f03329e4..743dcfcca640 100644
--- a/llvm/test/Analysis/Delinearization/byte_offset.ll
+++ b/llvm/test/Analysis/Delinearization/byte_offset.ll
@@ -13,7 +13,6 @@
 define void @foo(ptr %A, i64 %i2, i64 %arg, i1 %c) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store float 0.000000e+00, ptr %arrayidx, align 4
-; CHECK-NEXT:  In Loop with Header: inner.loop
 ; CHECK-NEXT:  AccessFunction: ({0,+,%i2}<%outer.loop> + %unknown)
 ; CHECK-NEXT:  failed to delinearize
 ;
diff --git a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
index c0b1a0b9cdda..0c0fb4170b14 100644
--- a/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
+++ b/llvm/test/Analysis/Delinearization/constant_functions_multi_dim.ll
@@ -7,14 +7,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define void @mat_mul(ptr %C, ptr %A, ptr %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
 ; CHECK-LABEL: 'mat_mul'
 ; CHECK-NEXT:  Inst: %tmp = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  In Loop with Header: for.inc
 ; CHECK-NEXT:  AccessFunction: {(4 * %N * %call),+,4}<%for.inc>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%N] with elements of 4 bytes.
 ; CHECK-NEXT:  ArrayRef[%call][{0,+,1}<nuw><nsw><%for.inc>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %tmp5 = load float, ptr %arrayidx4, align 4
-; CHECK-NEXT:  In Loop with Header: for.inc
 ; CHECK-NEXT:  AccessFunction: {(4 * %call1),+,(4 * %N)}<%for.inc>
 ; CHECK-NEXT:  Base offset: %B
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%N] with elements of 4 bytes.
diff --git a/llvm/test/Analysis/Delinearization/divide_by_one.ll b/llvm/test/Analysis/Delinearization/divide_by_one.ll
index 28fe5c50ae77..e812e65ba7fd 100644
--- a/llvm/test/Analysis/Delinearization/divide_by_one.ll
+++ b/llvm/test/Analysis/Delinearization/divide_by_one.ll
@@ -14,14 +14,12 @@ target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-n32"
 define void @test(ptr nocapture %dst, i32 %stride, i32 %bs) {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT:  Inst: %0 = load i8, ptr %arrayidx, align 1
-; CHECK-NEXT:  In Loop with Header: for.body3
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}(-1 + ((1 + %bs) * %stride)),+,(-1 * %stride)}<%for.cond1.preheader>,+,1}<nw><%for.body3>
 ; CHECK-NEXT:  Base offset: %dst
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%stride] with elements of 1 bytes.
 ; CHECK-NEXT:  ArrayRef[{(1 + %bs),+,-1}<nw><%for.cond1.preheader>][{-1,+,1}<nw><%for.body3>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i8 %0, ptr %arrayidx7, align 1
-; CHECK-NEXT:  In Loop with Header: for.body3
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}(%stride * %bs),+,(-1 * %stride)}<%for.cond1.preheader>,+,1}<nsw><%for.body3>
 ; CHECK-NEXT:  Base offset: %dst
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%stride] with elements of 1 bytes.
diff --git a/llvm/test/Analysis/Delinearization/fixed_size_array.ll b/llvm/test/Analysis/Delinearization/fixed_size_array.ll
index 634850bb4a5a..cecd1eacb143 100644
--- a/llvm/test/Analysis/Delinearization/fixed_size_array.ll
+++ b/llvm/test/Analysis/Delinearization/fixed_size_array.ll
@@ -11,7 +11,6 @@
 define void @a_i_j_k(ptr %a) {
 ; CHECK-LABEL: 'a_i_j_k'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,128}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][8][32] with elements of 4 bytes.
@@ -60,7 +59,6 @@ exit:
 define void @a_i_nj_k(ptr %a) {
 ; CHECK-LABEL: 'a_i_nj_k'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}896,+,1024}<nuw><nsw><%for.i.header>,+,-128}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][8][32] with elements of 4 bytes.
@@ -116,14 +114,12 @@ exit:
 define void @a_ijk_b_i2jk(ptr %a, ptr %b) {
 ; CHECK-LABEL: 'a_ijk_b_i2jk'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %a.idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,256}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][4][64] with elements of 4 bytes.
 ; CHECK-NEXT:  ArrayRef[{0,+,1}<nuw><nsw><%for.i.header>][{0,+,1}<nuw><nsw><%for.j.header>][{0,+,1}<nuw><nsw><%for.k>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 1, ptr %b.idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,256}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %b
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][4][64] with elements of 4 bytes.
@@ -180,7 +176,6 @@ exit:
 define void @a_i_2j1_k(ptr %a) {
 ; CHECK-LABEL: 'a_i_2j1_k'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}128,+,1024}<nuw><nsw><%for.i.header>,+,256}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][4][64] with elements of 4 bytes.
@@ -234,7 +229,6 @@ exit:
 define void @a_i_3j_k(ptr %a) {
 ; CHECK-LABEL: 'a_i_3j_k'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,384}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  failed to delinearize
 ;
@@ -286,7 +280,6 @@ exit:
 define void @a_i_j_3k(ptr %a) {
 ; CHECK-LABEL: 'a_i_j_3k'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,128}<nw><%for.j.header>,+,12}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][8][32] with elements of 4 bytes.
@@ -338,7 +331,6 @@ exit:
 define void @a_i_j2k_i(ptr %a) {
 ; CHECK-LABEL: 'a_i_j2k_i'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1028}<%for.i.header>,+,256}<nw><%for.j.header>,+,128}<nw><%for.k>
 ; CHECK-NEXT:  failed to delinearize
 ;
@@ -390,7 +382,6 @@ exit:
 define void @a_i_i_jk(ptr %a) {
 ; CHECK-LABEL: 'a_i_i_jk'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,1152}<%for.i.header>,+,4}<nw><%for.j.header>,+,4}<nw><%for.k>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][288] with elements of 4 bytes.
@@ -441,7 +432,6 @@ exit:
 define void @a_i_jk_l(ptr %a) {
 ; CHECK-LABEL: 'a_i_jk_l'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.l
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{\{}}0,+,1024}<nuw><nsw><%for.i.header>,+,128}<nw><%for.j.header>,+,128}<nw><%for.k.header>,+,4}<nw><%for.l>
 ; CHECK-NEXT:  Base offset: %a
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][8][32] with elements of 4 bytes.
@@ -502,7 +492,6 @@ exit:
 define void @non_divisible_by_element_size(ptr %a) {
 ; CHECK-LABEL: 'non_divisible_by_element_size'
 ; CHECK-NEXT:  Inst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,256}<nuw><nsw><%for.i.header>,+,32}<nw><%for.j.header>,+,1}<nw><%for.k>
 ; CHECK-NEXT:  failed to delinearize
 ;
diff --git a/llvm/test/Analysis/Delinearization/gcd_multiply_expr.ll b/llvm/test/Analysis/Delinearization/gcd_multiply_expr.ll
index 3e4545289a45..a5af30011f48 100644
--- a/llvm/test/Analysis/Delinearization/gcd_multiply_expr.ll
+++ b/llvm/test/Analysis/Delinearization/gcd_multiply_expr.ll
@@ -29,142 +29,114 @@
 define i32 @fn2() {
 ; CHECK-LABEL: 'fn2'
 ; CHECK-NEXT:  Inst: store i32 %storemerge.i, ptr @a, align 4
-; CHECK-NEXT:  In Loop with Header: for.cond2thread-pre-split.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %9 = load i8, ptr %arrayidx.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(%1 * %2),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %10 = load i8, ptr %arrayidx.1.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(1 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.1.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.1.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %11 = load i8, ptr %arrayidx.2.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(2 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.2.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.2.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %12 = load i8, ptr %arrayidx.3.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(3 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.3.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.3.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %13 = load i8, ptr %arrayidx.4.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(4 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.4.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.4.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %14 = load i8, ptr %arrayidx.5.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(5 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.5.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.5.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %15 = load i8, ptr %arrayidx.6.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(6 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.6.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.6.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %16 = load i8, ptr %arrayidx.7.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(7 + (%1 * %2)),+,1}<nw><%for.cond2thread-pre-split.i> + %.pr.i),+,8}<nw><%for.body4.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.7.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.7.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %21 = load i8, ptr %arrayidx.ur.i, align 1
-; CHECK-NEXT:  In Loop with Header: for.body4.ur.i
 ; CHECK-NEXT:  AccessFunction: (sext i32 {({(%1 * %2),+,1}<nw><%for.cond2thread-pre-split.i> + %.ph),+,1}<nw><%for.body4.ur.i> to i64)
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %conv.ur.i, ptr @c, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.ur.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i32 %inc.ur.i, ptr @b, align 4
-; CHECK-NEXT:  In Loop with Header: for.body4.ur.i
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ;
diff --git a/llvm/test/Analysis/Delinearization/himeno_1.ll b/llvm/test/Analysis/Delinearization/himeno_1.ll
index 292dca61d059..5ae5d04505b8 100644
--- a/llvm/test/Analysis/Delinearization/himeno_1.ll
+++ b/llvm/test/Analysis/Delinearization/himeno_1.ll
@@ -32,7 +32,6 @@
 define void @jacobi(i32 %nn, ptr nocapture %a, ptr nocapture %p) nounwind uwtable {
 ; CHECK-LABEL: 'jacobi'
 ; CHECK-NEXT:  Inst: store float 1.000000e+00, ptr %arrayidx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(4 + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))<nsw>)),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))<nsw>}<%for.j>,+,4}<%for.k>
 ; CHECK-NEXT:  Base offset: %a.base
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of 4 bytes.
diff --git a/llvm/test/Analysis/Delinearization/himeno_2.ll b/llvm/test/Analysis/Delinearization/himeno_2.ll
index d210539d67d8..75e4f027c4c6 100644
--- a/llvm/test/Analysis/Delinearization/himeno_2.ll
+++ b/llvm/test/Analysis/Delinearization/himeno_2.ll
@@ -32,7 +32,6 @@
 define void @jacobi(i32 %nn, ptr nocapture %a, ptr nocapture %p) nounwind uwtable {
 ; CHECK-LABEL: 'jacobi'
 ; CHECK-NEXT:  Inst: store float 1.000000e+00, ptr %arrayidx, align 4
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(4 + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))<nsw>)),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))<nsw>}<%for.j>,+,4}<%for.k>
 ; CHECK-NEXT:  Base offset: %a.base
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of 4 bytes.
diff --git a/llvm/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll b/llvm/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
index cbe3ec8a19ac..fc0a6c4e8b95 100644
--- a/llvm/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
+++ b/llvm/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
@@ -12,7 +12,6 @@
 define void @foo(i64 %n, i64 %m, i64 %b, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %arrayidx, align 8
-; CHECK-NEXT:  In Loop with Header: for.j
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}(8 * %m * %b),+,(16 * %m)}<%for.i>,+,16}<%for.j>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll b/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
index 3d21d9743846..0493a93dfee9 100644
--- a/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
@@ -12,7 +12,6 @@
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %idx, align 8
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(56 + (8 * (-4 + (3 * %m)) * %o)),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll b/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll
index 3dbd71b1c9ac..2e9c3d77f328 100644
--- a/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll
@@ -12,7 +12,6 @@
 define void @foo(i64 %n, i64 %m, i64 %o, i64 %p, ptr nocapture %A) nounwind uwtable {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %arrayidx10.us.us, align 8
-; CHECK-NEXT:  In Loop with Header: for.body6.us.us
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(56 + (8 * (-4 + (3 * %m)) * (%o + %p))),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>,+,(8 * (%o + %p))}<%for.body6.lr.ph.us.us>,+,8}<%for.body6.us.us>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m][(%o + %p)] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll b/llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
index a2d64a5b40bc..a31192ef72f0 100644
--- a/llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -12,7 +12,6 @@
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A, i64 %p, i64 %q, i64 %r) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %idx, align 8
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}(8 * ((((%m * %p) + %q) * %o) + %r)),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d.ll b/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
index ac83ba19b252..432f7af7e069 100644
--- a/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
@@ -12,14 +12,12 @@
 define void @foo(i64 %n, i64 %m, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: %val = load double, ptr %arrayidx, align 8
-; CHECK-NEXT:  In Loop with Header: for.j
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,(8 * %m)}<%for.i>,+,8}<%for.j>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m] with elements of 8 bytes.
 ; CHECK-NEXT:  ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{0,+,1}<nuw><nsw><%for.j>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store double %val, ptr %arrayidx, align 8
-; CHECK-NEXT:  In Loop with Header: for.j
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,(8 * %m)}<%for.i>,+,8}<%for.j>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll b/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll
index 262a092794cb..c7a2a89e3183 100644
--- a/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll
@@ -18,7 +18,6 @@
 define void @foo(i64 %a, i64 %b) nounwind uwtable {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %arrayidx10.us.us, align 8
-; CHECK-NEXT:  In Loop with Header: for.body9.us.us
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,{8,+,8}<%for.cond7.preheader.lr.ph.split.us.us>}<%for.body9.lr.ph.us.us>,+,8}<%for.body9.us.us>
 ; CHECK-NEXT:  failed to delinearize
 ;
diff --git a/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d.ll b/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d.ll
index b1405db81a78..966a8222d8a1 100644
--- a/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d.ll
@@ -12,7 +12,6 @@
 define void @foo(i64 %n, i64 %m, i64 %o, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %idx, align 8
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m][%o] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll b/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll
index 6de072ebaee1..da4082598466 100644
--- a/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll
@@ -14,7 +14,6 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @foo(i32 %n, i32 %m, i32 %o, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %idx, align 8
-; CHECK-NEXT:  In Loop with Header: for.k
 ; CHECK-NEXT:  AccessFunction: {{\{\{\{}}0,+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>,+,(8 * (zext i32 %o to i64))<nuw><nsw>}<%for.j>,+,8}<%for.k>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][(zext i32 %m to i64)][(zext i32 %o to i64)] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll b/llvm/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll
index d7148c521646..da77cd37fede 100644
--- a/llvm/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll
+++ b/llvm/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll
@@ -15,14 +15,12 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @foo(i64 %n, i64 %m, ptr %A) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %arrayidx, align 8
-; CHECK-NEXT:  In Loop with Header: for.j
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,(8 * %m)}<%for.i>,+,8}<%for.j>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%m] with elements of 8 bytes.
 ; CHECK-NEXT:  ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{0,+,1}<nuw><nsw><%for.j>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store double 1.000000e+00, ptr %arrayidx1, align 8
-; CHECK-NEXT:  In Loop with Header: for.j
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,8}<%for.i>,+,(8 * %n)}<%for.j>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%n] with elements of 8 bytes.
diff --git a/llvm/test/Analysis/Delinearization/parameter_addrec_product.ll b/llvm/test/Analysis/Delinearization/parameter_addrec_product.ll
index cbccafd1e654..49eeee3bd211 100644
--- a/llvm/test/Analysis/Delinearization/parameter_addrec_product.ll
+++ b/llvm/test/Analysis/Delinearization/parameter_addrec_product.ll
@@ -11,19 +11,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define void @foo(ptr %A, ptr %p) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: %pval = load i64, ptr %p, align 8
-; CHECK-NEXT:  In Loop with Header: bb4
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: %tmp11 = load float, ptr %tmp10, align 4
-; CHECK-NEXT:  In Loop with Header: bb4
 ; CHECK-NEXT:  AccessFunction: (4 * (({0,+,1}<nuw><nsw><%bb2> * %pval)<nsw> + {0,+,1}<nuw><nsw><%bb4>)<nsw>)<nsw>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%pval] with elements of 4 bytes.
 ; CHECK-NEXT:  ArrayRef[{0,+,1}<nuw><nsw><%bb2>][{0,+,1}<nuw><nsw><%bb4>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store float %tmp12, ptr %tmp10, align 4
-; CHECK-NEXT:  In Loop with Header: bb4
 ; CHECK-NEXT:  AccessFunction: (4 * (({0,+,1}<nuw><nsw><%bb2> * %pval)<nsw> + {0,+,1}<nuw><nsw><%bb4>)<nsw>)<nsw>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][%pval] with elements of 4 bytes.
diff --git a/llvm/test/Analysis/Delinearization/terms_with_identity_factor.ll b/llvm/test/Analysis/Delinearization/terms_with_identity_factor.ll
index 86adcc8a1957..5b0465f7fb75 100644
--- a/llvm/test/Analysis/Delinearization/terms_with_identity_factor.ll
+++ b/llvm/test/Analysis/Delinearization/terms_with_identity_factor.ll
@@ -9,14 +9,12 @@
 define void @foo(i32 %m, i32 %n, ptr nocapture %A) #0 {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:  Inst: %4 = load i8, ptr %arrayidx.us, align 1
-; CHECK-NEXT:  In Loop with Header: for.body3.us
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,(sext i32 %n to i64)}<nsw><%for.body3.lr.ph.us>,+,1}<nsw><%for.body3.us>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][(sext i32 %n to i64)] with elements of 1 bytes.
 ; CHECK-NEXT:  ArrayRef[{0,+,1}<nuw><nsw><%for.body3.lr.ph.us>][{0,+,1}<nuw><nsw><%for.body3.us>]
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  Inst: store i8 %add4.us, ptr %arrayidx.us, align 1
-; CHECK-NEXT:  In Loop with Header: for.body3.us
 ; CHECK-NEXT:  AccessFunction: {{\{\{}}0,+,(sext i32 %n to i64)}<nsw><%for.body3.lr.ph.us>,+,1}<nsw><%for.body3.us>
 ; CHECK-NEXT:  Base offset: %A
 ; CHECK-NEXT:  ArrayDecl[UnknownSize][(sext i32 %n to i64)] with elements of 1 bytes.
diff --git a/llvm/test/Analysis/Delinearization/type_mismatch.ll b/llvm/test/Analysis/Delinearization/type_mismatch.ll
index 1b78f2448574..6d344975daf9 100644
--- a/llvm/test/Analysis/Delinearization/type_mismatch.ll
+++ b/llvm/test/Analysis/Delinearization/type_mismatch.ll
@@ -12,7 +12,6 @@ target datalayout = "e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"
 define fastcc void @test(i1 %arg, ptr %x) {
 ; CHECK-LABEL: 'test'
 ; CHECK-NEXT:  Inst: store i8 42, ptr %arrayidx.phi, align 1
-; CHECK-NEXT:  In Loop with Header: for.body11
 ; CHECK-NEXT:  AccessFunction: 0
 ; CHECK-NEXT:  failed to delinearize
 ;
diff --git a/llvm/test/Analysis/DotMachineCFG/AMDGPU/functions.mir b/llvm/test/Analysis/DotMachineCFG/AMDGPU/functions.mir
index b34493aea866..83434c23f478 100644
--- a/llvm/test/Analysis/DotMachineCFG/AMDGPU/functions.mir
+++ b/llvm/test/Analysis/DotMachineCFG/AMDGPU/functions.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -mcfg-func-name=func2 -o -  %s 2>&1 > /dev/null
+# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -mcfg-func-name=func2 -o -  %s -filetype=null 2>&1
 # RUN: FileCheck %s -input-file=%t.func2.dot --check-prefix=MCFG
 
 # MCFG-NOT: digraph "Machine CFG for 'func1' function"
diff --git a/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir b/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
index 56ea4b528ba8..e50128e895b2 100644
--- a/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
+++ b/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
@@ -1,6 +1,6 @@
-# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -o -  %s 2>&1 > /dev/null
+# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -o -  %s -filetype=null 2>&1
 # RUN: FileCheck %s -input-file=%t.irreducible.dot --check-prefix=MCFG
-# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -dot-mcfg-only -o -  %s 2>&1 > /dev/null
+# RUN: llc -mtriple=amdgcn-- -run-pass=dot-machine-cfg  -mcfg-dot-filename-prefix=%t -dot-mcfg-only -o -  %s -filetype=null 2>&1
 # RUN: FileCheck %s -input-file=%t.irreducible.dot --check-prefix=MCFG-ONLY
 
 # MCFG: digraph "Machine CFG for 'irreducible' function"
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index fe140d01e881..7dec2f8f9690 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -427,6 +427,53 @@ exit:                                              ; preds = %loop
   ret i32 %crc.next
 }
 
+define i16 @crc16.be.tc8.zext.data(i8 %msg, i16 %checksum) {
+; CHECK-LABEL: 'crc16.be.tc8.zext.data'
+; CHECK-NEXT:  Found big-endian CRC-16 loop with trip count 8
+; CHECK-NEXT:    Initial CRC: i16 %checksum
+; CHECK-NEXT:    Generating polynomial: 258
+; CHECK-NEXT:    Computed CRC: %crc.next = select i1 %check.sb, i16 %crc.shl, i16 %crc.xor
+; CHECK-NEXT:    Auxiliary data: i8 %msg
+; CHECK-NEXT:    Computed CRC lookup table:
+; CHECK-NEXT:  0 258 516 774 1032 1290 1548 1806 2064 2322 2580 2838 3096 3354 3612 3870
+; CHECK-NEXT:  4128 4386 4644 4902 5160 5418 5676 5934 6192 6450 6708 6966 7224 7482 7740 7998
+; CHECK-NEXT:  8256 8514 8772 9030 9288 9546 9804 10062 10320 10578 10836 11094 11352 11610 11868 12126
+; CHECK-NEXT:  12384 12642 12900 13158 13416 13674 13932 14190 14448 14706 14964 15222 15480 15738 15996 16254
+; CHECK-NEXT:  16512 16770 17028 17286 17544 17802 18060 18318 18576 18834 19092 19350 19608 19866 20124 20382
+; CHECK-NEXT:  20640 20898 21156 21414 21672 21930 22188 22446 22704 22962 23220 23478 23736 23994 24252 24510
+; CHECK-NEXT:  24768 25026 25284 25542 25800 26058 26316 26574 26832 27090 27348 27606 27864 28122 28380 28638
+; CHECK-NEXT:  28896 29154 29412 29670 29928 30186 30444 30702 30960 31218 31476 31734 31992 32250 32508 32766
+; CHECK-NEXT:  33024 32770 33540 33286 34056 33802 34572 34318 35088 34834 35604 35350 36120 35866 36636 36382
+; CHECK-NEXT:  37152 36898 37668 37414 38184 37930 38700 38446 39216 38962 39732 39478 40248 39994 40764 40510
+; CHECK-NEXT:  41280 41026 41796 41542 42312 42058 42828 42574 43344 43090 43860 43606 44376 44122 44892 44638
+; CHECK-NEXT:  45408 45154 45924 45670 46440 46186 46956 46702 47472 47218 47988 47734 48504 48250 49020 48766
+; CHECK-NEXT:  49536 49282 50052 49798 50568 50314 51084 50830 51600 51346 52116 51862 52632 52378 53148 52894
+; CHECK-NEXT:  53664 53410 54180 53926 54696 54442 55212 54958 55728 55474 56244 55990 56760 56506 57276 57022
+; CHECK-NEXT:  57792 57538 58308 58054 58824 58570 59340 59086 59856 59602 60372 60118 60888 60634 61404 61150
+; CHECK-NEXT:  61920 61666 62436 62182 62952 62698 63468 63214 63984 63730 64500 64246 65016 64762 65532 65278
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %data = phi i8 [ %msg, %entry ], [ %data.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data.ext = zext i8 %data to i16
+  %xor.crc.data = xor i16 %crc, %data.ext
+  %check.sb = icmp sge i16 %xor.crc.data, 0
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 258
+  %crc.next = select i1 %check.sb, i16 %crc.shl, i16 %crc.xor
+  %data.next = shl i8 %data, 1
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
 ; Negative tests
 
 define i16 @not.crc.non.const.tc(i16 %crc.init, i32 %loop.limit) {
@@ -527,10 +574,10 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
-define i16 @not.crc.tc.limit(i16 %crc.init) {
-; CHECK-LABEL: 'not.crc.tc.limit'
+define i16 @not.crc.tc.exceeds.data.bw(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.tc.exceeds.data.bw'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Unable to find a small constant byte-multiple trip count
+; CHECK-NEXT:  Reason: Loop iterations exceed bitwidth of data
 ;
 entry:
   br label %loop
@@ -543,7 +590,7 @@ loop:                                              ; preds = %loop, %entry
   %check.sb = icmp slt i16 %crc, 0
   %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
   %iv.next = add nuw nsw i32 %iv, 1
-  %exit.cond = icmp samesign ult i32 %iv, 512
+  %exit.cond = icmp samesign ult i32 %iv, 511
   br i1 %exit.cond, label %loop, label %exit
 
 exit:                                              ; preds = %loop
@@ -649,7 +696,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.wrong.sb.check.const(i8 %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.wrong.sb.check.const'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad RHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -676,7 +723,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.wrong.sb.check.pred(i16 %crc.init) {
 ; CHECK-LABEL: 'not.crc.wrong.sb.check.pred'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad RHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -750,7 +797,7 @@ exit:                                              ; preds = %loop
 define i32 @not.crc.unknown.icmp.rhs(i32 %checksum, i32 %msg, i32 %unknown) {
 ; CHECK-LABEL: 'not.crc.unknown.icmp.rhs'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad LHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -777,7 +824,7 @@ exit:                                              ; preds = %loop
 define i32 @not.crc.unknown.icmp.lhs(i32 %checksum, i32 %msg, i32 %unknown) {
 ; CHECK-LABEL: 'not.crc.unknown.icmp.lhs'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad LHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -805,7 +852,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.stray.or(i16 %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.stray.or'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad LHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -833,7 +880,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.inverse.sb.check(i16 %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.inverse.sb.check'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Expected top 16 bits zero (1100000000000001)
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -857,10 +904,10 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
-define i16 @crc1.tc8.sb.check.endian.mismatch(i8 %msg, i16 %checksum) {
-; CHECK-LABEL: 'crc1.tc8.sb.check.endian.mismatch'
+define i16 @not.crc.sb.check.endian.mismatch(i8 %msg, i16 %checksum) {
+; CHECK-LABEL: 'not.crc.sb.check.endian.mismatch'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Bad RHS of significant-bit-check
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -888,7 +935,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.init.arg.inverted.select(i16 %crc.init) {
 ; CHECK-LABEL: 'not.crc.init.arg.inverted.select'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Expected top 8 bits zero (11000000????????)
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -912,7 +959,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.bad.endian.swapped.sb.check(i8 %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.bad.endian.swapped.sb.check'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Found stray unvisited instructions
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -1109,9 +1156,38 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.unknown.value(i16 %msg, i16 %checksum, i16 %corrupt) {
 ; CHECK-LABEL: 'not.crc.unknown.value'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Unknown Value
+; CHECK-NEXT:  Reason: Malformed significant-bit check
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %msg, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %crc, %data
+  %xor.crc.data.corrupt = mul i16 %xor.crc.data, %corrupt
+  %and.crc.data = and i16 %xor.crc.data.corrupt, 1
+  %data.next = lshr i16 %data, 1
+  %check.sb = icmp eq i16 %and.crc.data, 0
+  %crc.lshr = lshr i16 %crc, 1
+  %crc.xor = xor i16 %crc.lshr, -24575
+  %crc.next = select i1 %check.sb, i16 %crc.lshr, i16 %crc.xor
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 15
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
+define i16 @not.crc.unknown.call.outside.loop(i16 %msg, i16 %checksum) {
+; CHECK-LABEL: 'not.crc.unknown.call.outside.loop'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
+  %corrupt = call i16 @side.effect()
   br label %loop
 
 loop:                                              ; preds = %loop, %entry
@@ -1134,6 +1210,34 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
+define i16 @not.crc.constant.sb.check.corruption(i16 %msg, i16 %checksum) {
+; CHECK-LABEL: 'not.crc.constant.sb.check.corruption'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Malformed significant-bit check
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data = phi i16 [ %msg, %entry ], [ %data.next, %loop ]
+  %xor.crc.data = xor i16 %crc, %data
+  %xor.crc.data.corrupt = mul i16 %xor.crc.data, 2
+  %and.crc.data = and i16 %xor.crc.data.corrupt, 1
+  %data.next = lshr i16 %data, 1
+  %check.sb = icmp eq i16 %and.crc.data, 0
+  %crc.lshr = lshr i16 %crc, 1
+  %crc.xor = xor i16 %crc.lshr, -24575
+  %crc.next = select i1 %check.sb, i16 %crc.lshr, i16 %crc.xor
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 15
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
 define i16 @not.crc.float.simple.recurrence(float %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.float.simple.recurrence'
 ; CHECK-NEXT:  Did not find a hash algorithm
@@ -1219,7 +1323,7 @@ declare void @print(i16)
 define i16 @not.crc.call.sb.check(i16 %crc.init) {
 ; CHECK-LABEL: 'not.crc.call.sb.check'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Found stray unvisited instructions
+; CHECK-NEXT:  Reason: Malformed significant-bit check
 ;
 entry:
   br label %loop
@@ -1240,4 +1344,108 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
+define i16 @not.crc.bad.lhs.sb.check.be(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.bad.lhs.sb.check.be'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Malformed significant-bit check
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 4129
+  %check.sb = icmp slt i16 %crc.shl, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign ult i32 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
+define i16 @not.crc.bad.cast.sext(i8 %msg, i16 %checksum) {
+; CHECK-LABEL: 'not.crc.bad.cast.sext'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Recurrences not intertwined with XOR
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %data = phi i8 [ %msg, %entry ], [ %data.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data.ext = sext i8 %data to i16
+  %xor.crc.data = xor i16 %crc, %data.ext
+  %check.sb = icmp sge i16 %xor.crc.data, 0
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 258
+  %crc.next = select i1 %check.sb, i16 %crc.shl, i16 %crc.xor
+  %data.next = shl i8 %data, 1
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
+
+define i16 @not.crc.sb.check.patternmatch.fail(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.sb.check.patternmatch.fail'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Malformed significant-bit check
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %crc.shl = shl i16 %crc, 1
+  %evil.and.iv = and i16 %iv, 2
+  %evil.and.1 = add i16 %evil.and.iv, 1
+  %evil.mul = mul i16 %crc.shl, %evil.and.1
+  %evil.xor = xor i16 %evil.mul, 4129
+  %check.sb = icmp slt i16 %crc, 0
+  %crc.next = select i1 %check.sb, i16 %evil.xor, i16 %evil.mul
+  %iv.next = add nuw nsw i16 %iv, 1
+  %exitcond.not = icmp eq i16 %iv.next, 8
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
+define i16 @not.crc.sb.check.patternmatch.fail.call.outside.loop(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.sb.check.patternmatch.fail.call.outside.loop'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Malformed significant-bit check
+;
+entry:
+  %corrupt = call i16 @side.effect()
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %crc.shl = shl i16 %crc, 1
+  %evil.and.corrupt = and i16 %corrupt, 2
+  %evil.and.1 = add i16 %evil.and.corrupt, 1
+  %evil.mul = mul i16 %crc.shl, %evil.and.1
+  %evil.xor = xor i16 %evil.mul, 4129
+  %check.sb = icmp slt i16 %crc, 0
+  %crc.next = select i1 %check.sb, i16 %evil.xor, i16 %evil.mul
+  %iv.next = add nuw nsw i16 %iv, 1
+  %exitcond.not = icmp eq i16 %iv.next, 8
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
 declare i16 @side.effect()
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_type_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_type_vocab.json
index bb97a491dfe8..fcc1344ada31 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_type_vocab.json
+++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_type_vocab.json
@@ -47,6 +47,7 @@
         "FPTrunc": [0, 0, 0],
         "FPExt": [0, 0, 0],
         "PtrToInt": [0, 0, 0],
+        "PtrToAddr": [0, 0, 0],
         "IntToPtr": [0, 0, 0],
         "BitCast": [0, 0, 0],
         "AddrSpaceCast": [0, 0, 0],
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
index 1b9b3c2acd8a..df7769c9c6a6 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
@@ -67,25 +67,16 @@ Key: InsertValue:  [ 129.00  130.00 ]
 Key: LandingPad:  [ 131.00  132.00 ]
 Key: Freeze:  [ 133.00  134.00 ]
 Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
 Key: VoidTy:  [ 1.50  2.00 ]
 Key: LabelTy:  [ 2.50  3.00 ]
 Key: MetadataTy:  [ 3.50  4.00 ]
-Key: UnknownTy:  [ 4.50  5.00 ]
+Key: VectorTy:  [ 11.50  12.00 ]
 Key: TokenTy:  [ 5.50  6.00 ]
 Key: IntegerTy:  [ 6.50  7.00 ]
 Key: FunctionTy:  [ 7.50  8.00 ]
 Key: PointerTy:  [ 8.50  9.00 ]
 Key: StructTy:  [ 9.50  10.00 ]
 Key: ArrayTy:  [ 10.50  11.00 ]
-Key: VectorTy:  [ 11.50  12.00 ]
-Key: VectorTy:  [ 11.50  12.00 ]
-Key: PointerTy:  [ 8.50  9.00 ]
 Key: UnknownTy:  [ 4.50  5.00 ]
 Key: Function:  [ 0.20  0.40 ]
 Key: Pointer:  [ 0.60  0.80 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
index 9673e7f23fa5..f3ce809fd2fd 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
@@ -67,25 +67,16 @@ Key: InsertValue:  [ 64.50  65.00 ]
 Key: LandingPad:  [ 65.50  66.00 ]
 Key: Freeze:  [ 66.50  67.00 ]
 Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
-Key: FloatTy:  [ 0.50  1.00 ]
 Key: VoidTy:  [ 1.50  2.00 ]
 Key: LabelTy:  [ 2.50  3.00 ]
 Key: MetadataTy:  [ 3.50  4.00 ]
-Key: UnknownTy:  [ 4.50  5.00 ]
+Key: VectorTy:  [ 11.50  12.00 ]
 Key: TokenTy:  [ 5.50  6.00 ]
 Key: IntegerTy:  [ 6.50  7.00 ]
 Key: FunctionTy:  [ 7.50  8.00 ]
 Key: PointerTy:  [ 8.50  9.00 ]
 Key: StructTy:  [ 9.50  10.00 ]
 Key: ArrayTy:  [ 10.50  11.00 ]
-Key: VectorTy:  [ 11.50  12.00 ]
-Key: VectorTy:  [ 11.50  12.00 ]
-Key: PointerTy:  [ 8.50  9.00 ]
 Key: UnknownTy:  [ 4.50  5.00 ]
 Key: Function:  [ 0.50  1.00 ]
 Key: Pointer:  [ 1.50  2.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
index 1f575d29092d..72b25b9bd3d9 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
@@ -67,25 +67,16 @@ Key: InsertValue:  [ 12.90  13.00 ]
 Key: LandingPad:  [ 13.10  13.20 ]
 Key: Freeze:  [ 13.30  13.40 ]
 Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
-Key: FloatTy:  [ 0.00  0.00 ]
 Key: VoidTy:  [ 0.00  0.00 ]
 Key: LabelTy:  [ 0.00  0.00 ]
 Key: MetadataTy:  [ 0.00  0.00 ]
-Key: UnknownTy:  [ 0.00  0.00 ]
+Key: VectorTy:  [ 0.00  0.00 ]
 Key: TokenTy:  [ 0.00  0.00 ]
 Key: IntegerTy:  [ 0.00  0.00 ]
 Key: FunctionTy:  [ 0.00  0.00 ]
 Key: PointerTy:  [ 0.00  0.00 ]
 Key: StructTy:  [ 0.00  0.00 ]
 Key: ArrayTy:  [ 0.00  0.00 ]
-Key: VectorTy:  [ 0.00  0.00 ]
-Key: VectorTy:  [ 0.00  0.00 ]
-Key: PointerTy:  [ 0.00  0.00 ]
 Key: UnknownTy:  [ 0.00  0.00 ]
 Key: Function:  [ 0.00  0.00 ]
 Key: Pointer:  [ 0.00  0.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/basic-flowaware.ll b/llvm/test/Analysis/IR2Vec/basic-flowaware.ll
new file mode 100644
index 000000000000..4a7f970a9cf9
--- /dev/null
+++ b/llvm/test/Analysis/IR2Vec/basic-flowaware.ll
@@ -0,0 +1,72 @@
+; RUN: opt -passes='print<ir2vec>' -ir2vec-kind=flow-aware -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_opc_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-OPC
+; RUN: opt -passes='print<ir2vec>' -ir2vec-kind=flow-aware -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_type_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-TYPE
+; RUN: opt -passes='print<ir2vec>' -ir2vec-kind=flow-aware -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_arg_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-ARG
+
+define dso_local noundef float @_Z3abcif(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; 3D-CHECK-OPC: IR2Vec embeddings for function _Z3abcif:
+; 3D-CHECK-OPC-NEXT: Function vector: [ 3630.00 3672.00 3714.00 ]
+; 3D-CHECK-OPC-NEXT: Basic block vectors:
+; 3D-CHECK-OPC-NEXT: Basic block: entry:
+; 3D-CHECK-OPC-NEXT:  [ 3630.00 3672.00 3714.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction vectors:
+; 3D-CHECK-OPC-NEXT: Instruction:   %a.addr = alloca i32, align 4 [ 91.00  92.00  93.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %b.addr = alloca float, align 4 [ 91.00  92.00  93.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   store i32 %a, ptr %a.addr, align 4 [ 188.00 190.00 192.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   store float %b, ptr %b.addr, align 4 [ 188.00 190.00 192.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %0 = load i32, ptr %a.addr, align 4 [ 185.00 187.00 189.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %1 = load i32, ptr %a.addr, align 4 [ 185.00 187.00 189.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %mul = mul nsw i32 %0, %1 [ 419.00  424.00  429.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %conv = sitofp i32 %mul to float [ 549.00  555.00  561.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %2 = load float, ptr %b.addr, align 4 [ 185.00  187.00  189.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   %add = fadd float %conv, %2 [ 774.00  783.00  792.00 ]
+; 3D-CHECK-OPC-NEXT: Instruction:   ret float %add [ 775.00  785.00  795.00 ]
+
+; 3D-CHECK-TYPE: IR2Vec embeddings for function _Z3abcif:
+; 3D-CHECK-TYPE-NEXT: Function vector:  [ 355.50  376.50  397.50 ]
+; 3D-CHECK-TYPE-NEXT: Basic block vectors:
+; 3D-CHECK-TYPE-NEXT: Basic block: entry:
+; 3D-CHECK-TYPE-NEXT:  [ 355.50  376.50  397.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction vectors:
+; 3D-CHECK-TYPE-NEXT: Instruction:   %a.addr = alloca i32, align 4 [ 12.50  13.00  13.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %b.addr = alloca float, align 4 [ 12.50  13.00  13.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   store i32 %a, ptr %a.addr, align 4 [ 14.50  15.50  16.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   store float %b, ptr %b.addr, align 4 [ 14.50  15.50  16.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %0 = load i32, ptr %a.addr, align 4 [ 22.00  23.00  24.00 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %1 = load i32, ptr %a.addr, align 4 [ 22.00  23.00  24.00 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %mul = mul nsw i32 %0, %1 [ 53.50  56.00  58.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %conv = sitofp i32 %mul to float [ 54.00  57.00  60.00 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %2 = load float, ptr %b.addr, align 4 [ 13.00  14.00  15.00 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   %add = fadd float %conv, %2 [ 67.50  72.00  76.50 ]
+; 3D-CHECK-TYPE-NEXT: Instruction:   ret float %add [ 69.50  74.50  79.50 ]
+
+; 3D-CHECK-ARG: IR2Vec embeddings for function _Z3abcif:
+; 3D-CHECK-ARG-NEXT: Function vector:  [ 27.80  31.60  35.40 ]
+; 3D-CHECK-ARG-NEXT: Basic block vectors:
+; 3D-CHECK-ARG-NEXT: Basic block: entry:
+; 3D-CHECK-ARG-NEXT:  [ 27.80  31.60  35.40 ]
+; 3D-CHECK-ARG-NEXT: Instruction vectors:
+; 3D-CHECK-ARG-NEXT: Instruction:   %a.addr = alloca i32, align 4 [ 1.40  1.60  1.80 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %b.addr = alloca float, align 4 [ 1.40  1.60  1.80 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   store i32 %a, ptr %a.addr, align 4 [ 3.40  3.80  4.20 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   store float %b, ptr %b.addr, align 4 [ 3.40  3.80  4.20 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %0 = load i32, ptr %a.addr, align 4 [ 1.40  1.60  1.80 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %1 = load i32, ptr %a.addr, align 4 [ 1.40  1.60  1.80 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %mul = mul nsw i32 %0, %1 [ 2.80  3.20  3.60 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %conv = sitofp i32 %mul to float [ 2.80  3.20  3.60 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %2 = load float, ptr %b.addr, align 4 [ 1.40  1.60  1.80 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   %add = fadd float %conv, %2 [ 4.20  4.80  5.40 ]
+; 3D-CHECK-ARG-NEXT: Instruction:   ret float %add [ 4.20  4.80  5.40 ]
diff --git a/llvm/test/Analysis/IR2Vec/basic.ll b/llvm/test/Analysis/IR2Vec/basic-symbolic.ll
index cb0544fb1986..35abd3c7fa26 100644
--- a/llvm/test/Analysis/IR2Vec/basic.ll
+++ b/llvm/test/Analysis/IR2Vec/basic-symbolic.ll
@@ -1,11 +1,7 @@
 ; RUN: opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_opc_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-OPC
 ; RUN: opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_type_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-TYPE
 ; RUN: opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/dummy_3D_nonzero_arg_vocab.json %s 2>&1 | FileCheck %s -check-prefix=3D-CHECK-ARG
-; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab1.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB1-CHECK
-; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab2.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB2-CHECK
-; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab3.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB3-CHECK
-; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab4.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB4-CHECK
- 
+
 define dso_local noundef float @_Z3abcif(i32 noundef %a, float noundef %b) #0 {
 entry:
   %a.addr = alloca i32, align 4
@@ -74,11 +70,3 @@ entry:
 ; 3D-CHECK-ARG-NEXT: Instruction:   %2 = load float, ptr %b.addr, align 4 [ 0.80  1.00  1.20 ]
 ; 3D-CHECK-ARG-NEXT: Instruction:   %add = fadd float %conv, %2 [ 4.00  4.40  4.80 ]
 ; 3D-CHECK-ARG-NEXT: Instruction:   ret float %add [ 2.00  2.20  2.40 ]
-
-; INCORRECT-VOCAB1-CHECK: error: Error reading vocabulary: Missing 'Opcodes' section in vocabulary file
-
-; INCORRECT-VOCAB2-CHECK: error: Error reading vocabulary: Missing 'Types' section in vocabulary file
-
-; INCORRECT-VOCAB3-CHECK: error: Error reading vocabulary: Missing 'Arguments' section in vocabulary file
-
-; INCORRECT-VOCAB4-CHECK: error: Error reading vocabulary: Vocabulary sections have different dimensions
diff --git a/llvm/test/Analysis/IR2Vec/basic-vocab.ll b/llvm/test/Analysis/IR2Vec/basic-vocab.ll
new file mode 100644
index 000000000000..eeeee831814a
--- /dev/null
+++ b/llvm/test/Analysis/IR2Vec/basic-vocab.ll
@@ -0,0 +1,27 @@
+; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab1.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB1-CHECK
+; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab2.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB2-CHECK
+; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab3.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB3-CHECK
+; RUN: not opt -passes='print<ir2vec>' -o /dev/null -ir2vec-vocab-path=%S/Inputs/incorrect_vocab4.json %s 2>&1 | FileCheck %s -check-prefix=INCORRECT-VOCAB4-CHECK
+ 
+define dso_local noundef float @_Z3abcif(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; INCORRECT-VOCAB1-CHECK: error: Error reading vocabulary: Missing 'Opcodes' section in vocabulary file
+
+; INCORRECT-VOCAB2-CHECK: error: Error reading vocabulary: Missing 'Types' section in vocabulary file
+
+; INCORRECT-VOCAB3-CHECK: error: Error reading vocabulary: Missing 'Arguments' section in vocabulary file
+
+; INCORRECT-VOCAB4-CHECK: error: Error reading vocabulary: Vocabulary sections have different dimensions
diff --git a/llvm/test/Analysis/Lint/get-active-lane-mask.ll b/llvm/test/Analysis/Lint/get-active-lane-mask.ll
deleted file mode 100644
index e9b161846e8b..000000000000
--- a/llvm/test/Analysis/Lint/get-active-lane-mask.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt -passes=lint -disable-output < %s 2>&1 | FileCheck %s
-
-define <4 x i1> @t1(i32 %IV) {
-;
-; CHECK:      get_active_lane_mask: operand #2 must be greater than 0
-; CHECK-NEXT: %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 0)
-;
-  %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 0)
-  ret <4 x i1> %res
-}
-
-define <4 x i1> @t2(i32 %IV) {
-;
-; CHECK-NOT: get_active_lane_mask
-; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask
-;
-  %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 1)
-  ret <4 x i1> %res
-}
-
-define <4 x i1> @t3(i32 %IV) {
-;
-; CHECK-NOT: get_active_lane_mask
-; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask
-;
-  %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 -1)
-  ret <4 x i1> %res
-}
-
-define <4 x i1> @t4(i32 %IV, i32 %TC) {
-;
-; CHECK-NOT: get_active_lane_mask
-; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask
-;
-  %res = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %IV, i32 %TC)
-  ret <4 x i1> %res
-}
-
-declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll
new file mode 100644
index 000000000000..75014f3a58eb
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes='print<scalar-evolution>' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s
+
+define void @ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr(ptr %start, ptr %end) {
+; CHECK-LABEL: 'ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr'
+; CHECK-NEXT:  Determining loop execution counts for: @ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr
+; CHECK-NEXT:  Loop %loop: backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+;
+entry:
+  %end.i = ptrtoint ptr %end to i64
+  %start.i = ptrtoint ptr %start to i64
+  %sub = sub i64 %end.i, %start.i
+  %pre.1 = icmp eq i64 %sub, 4
+  call void @llvm.assume(i1 %pre.1)
+  br label %loop
+
+loop:
+  %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
+  store i32 0, ptr %iv
+  %iv.next = getelementptr inbounds nuw i8, ptr %iv, i64 4
+  %ec = icmp eq ptr %iv.next, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll
new file mode 100644
index 000000000000..951b07272dd4
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes='print<scalar-evolution>' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s
+
+define void @max_btc_improved_by_applying_guards_to_add_subexpr(i32 %low, i32 %high) {
+; CHECK-LABEL: 'max_btc_improved_by_applying_guards_to_add_subexpr'
+; CHECK-NEXT:  Determining loop execution counts for: @max_btc_improved_by_applying_guards_to_add_subexpr
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + (zext i32 (1 + (-1 * %low) + %high) to i64))<nsw>
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 7
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + (zext i32 (1 + (-1 * %low) + %high) to i64))<nsw>
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+;
+entry:
+  %sub = sub i32 %high, %low
+  %pre.1 = icmp slt i32 %sub, 8
+  br i1 %pre.1, label %if.then, label %exit
+
+if.then:
+  %pre.2 = icmp slt i32 %sub, 0
+  br i1 %pre.2, label %exit, label %ph
+
+ph:
+  %add.1 = add i32 %sub, 1
+  %wide.trip.count = zext i32 %add.1 to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll
index 8c77d704eac6..4e5033b7a2f7 100644
--- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll
+++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll
@@ -12,9 +12,9 @@ define void @rewrite_zext(i32 %n) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, -8
 ; CHECK-NEXT:    --> (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> U: [0,4294967289) S: [0,4294967289)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %check ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,17) S: [0,17) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,17) S: [0,17) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw nsw i64 %index, 8
-; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,25) S: [8,25) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,25) S: [8,25) Exits: (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 2
@@ -52,11 +52,11 @@ define i32 @rewrite_zext_min_max(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> U: [0,17) S: [0,17)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_min_max
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -98,11 +98,11 @@ define i32 @rewrite_min_max_zext(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %umin, 28
 ; CHECK-NEXT:    --> (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> U: [0,17) S: [0,17)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_min_max_zext
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -144,11 +144,11 @@ define i32 @rewrite_sext_min_max(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nsw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_sext_min_max
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -190,11 +190,11 @@ define i32 @rewrite_min_max_sext(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %smin, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nsw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_min_max_sext
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -288,9 +288,9 @@ define i32 @rewrite_zext_no_icmp_ne(i32 %N) {
 ; CHECK-NEXT:    %n.vec = and i64 %n.rnd.up, 8589934588
 ; CHECK-NEXT:    --> (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw> U: [4,4294967297) S: [4,4294967297)
 ; CHECK-NEXT:    %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,4294967293) S: [0,4294967293) Exits: (4 * ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw>)<nsw> /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,4294967293) S: [0,4294967293) Exits: (-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.next = add i64 %iv, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,4294967297) S: [4,4294967297) Exits: (4 + (4 * ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw>)<nsw> /u 4))<nuw><nsw>)<nuw><nsw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,4294967297) S: [4,4294967297) Exits: (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_no_icmp_ne
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32))<nsw> to i64))<nuw><nsw> /u 4))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 1073741823
@@ -328,9 +328,9 @@ define void @rewrite_zext_and_base_1(i32 %n) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, -8
 ; CHECK-NEXT:    --> (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> U: [0,4294967289) S: [0,4294967289)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %check ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,25) S: [0,25) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,25) S: [0,25) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw nsw i64 %index, 8
-; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,33) S: [8,33) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,33) S: [8,33) Exits: (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_and_base_1
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -371,9 +371,9 @@ define void @rewrite_zext_and_base_2(i32 %n) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, -8
 ; CHECK-NEXT:    --> (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> U: [0,4294967289) S: [0,4294967289)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %check ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,25) S: [0,25) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,8}<nuw><nsw><%loop> U: [0,25) S: [0,25) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw nsw i64 %index, 8
-; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,33) S: [8,33) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {8,+,8}<nuw><nsw><%loop> U: [8,33) S: [8,33) Exits: (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_and_base_2
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))<nuw><nsw>)<nsw> /u 8)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -412,9 +412,9 @@ define void @guard_pessimizes_analysis_step2(i1 %c, i32 %N) {
 ; CHECK-NEXT:    %init = phi i64 [ 2, %entry ], [ 4, %bb1 ]
 ; CHECK-NEXT:    --> %init U: [2,5) S: [2,5)
 ; CHECK-NEXT:    %iv = phi i64 [ %iv.next, %loop ], [ %init, %loop.ph ]
-; CHECK-NEXT:    --> {%init,+,2}<nuw><nsw><%loop> U: [2,17) S: [2,17) Exits: ((2 * ((14 + (-1 * %init)<nsw>)<nsw> /u 2))<nuw><nsw> + %init) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {%init,+,2}<nuw><nsw><%loop> U: [2,17) S: [2,17) Exits: 14 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.next = add i64 %iv, 2
-; CHECK-NEXT:    --> {(2 + %init)<nuw><nsw>,+,2}<nuw><nsw><%loop> U: [4,19) S: [4,19) Exits: (2 + (2 * ((14 + (-1 * %init)<nsw>)<nsw> /u 2))<nuw><nsw> + %init) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {(2 + %init)<nuw><nsw>,+,2}<nuw><nsw><%loop> U: [4,19) S: [4,19) Exits: 16 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @guard_pessimizes_analysis_step2
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((14 + (-1 * %init)<nsw>)<nsw> /u 2)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 6
@@ -456,11 +456,11 @@ define i32 @rewrite_sext_slt_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_sext_slt_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -500,11 +500,11 @@ define i32 @rewrite_zext_ult_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_ult_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -544,11 +544,11 @@ define i32 @rewrite_zext_ule_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_ule_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -588,11 +588,11 @@ define i32 @rewrite_zext_sle_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_sle_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -632,11 +632,11 @@ define i32 @rewrite_zext_uge_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> U: [0,17) S: [0,17)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_uge_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -676,11 +676,11 @@ define i32 @rewrite_sext_sge_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_sext_sge_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -720,11 +720,11 @@ define i32 @rewrite_zext_ugt_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> U: [0,17) S: [0,17)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_zext_ugt_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -764,11 +764,11 @@ define i32 @rewrite_sext_sgt_narrow_check(i32 %N, ptr %arr) {
 ; CHECK-NEXT:    %n.vec = and i64 %ext, 28
 ; CHECK-NEXT:    --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> U: [0,29) S: [0,29)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ]
-; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {0,+,4}<nuw><nsw><%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i32, ptr %arr, i64 %index
 ; CHECK-NEXT:    --> {%arr,+,16}<nuw><%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)) + %arr) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4))<nuw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {4,+,4}<nuw><nsw><%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_sext_sgt_narrow_check
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))<nuw><nsw>)<nsw> /u 4)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 3
@@ -808,9 +808,9 @@ define void @rewrite_add_rec() {
 ; CHECK-NEXT:    %n.vec = and i64 %sub, -2
 ; CHECK-NEXT:    --> (2 * ({9,+,-1}<nsw><%outer.header> /u 2))<nuw><nsw> U: [0,9) S: [0,9) Exits: 0 LoopDispositions: { %outer.header: Computable, %inner: Invariant }
 ; CHECK-NEXT:    %inner.iv = phi i64 [ 0, %inner.ph ], [ %inner.iv.next, %inner ]
-; CHECK-NEXT:    --> {0,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ((-2 + (2 * ({9,+,-1}<nsw><%outer.header> /u 2))<nuw><nsw>)<nsw> /u 2))<nuw> LoopDispositions: { %inner: Computable, %outer.header: Variant }
+; CHECK-NEXT:    --> {0,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (-2 + (2 * ({9,+,-1}<nsw><%outer.header> /u 2))<nuw><nsw>)<nsw> LoopDispositions: { %inner: Computable, %outer.header: Variant }
 ; CHECK-NEXT:    %inner.iv.next = add i64 %inner.iv, 2
-; CHECK-NEXT:    --> {2,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 + (2 * ((-2 + (2 * ({9,+,-1}<nsw><%outer.header> /u 2))<nuw><nsw>)<nsw> /u 2))<nuw>) LoopDispositions: { %inner: Computable, %outer.header: Variant }
+; CHECK-NEXT:    --> {2,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ({9,+,-1}<nsw><%outer.header> /u 2))<nuw><nsw> LoopDispositions: { %inner: Computable, %outer.header: Variant }
 ; CHECK-NEXT:    %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%outer.header> U: [1,11) S: [1,11) Exits: 10 LoopDispositions: { %outer.header: Computable, %inner: Invariant }
 ; CHECK-NEXT:  Determining loop execution counts for: @rewrite_add_rec
diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll
index 9bf2427eddb9..4024c986dd11 100644
--- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll
@@ -1231,7 +1231,7 @@ define void @optimized_range_check_unsigned3(ptr %pred, i1 %c) {
 ; CHECK-NEXT:    %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK-NEXT:    --> {0,+,1}<nuw><nsw><%loop> U: [0,3) S: [0,3) Exits: (-1 + %N)<nsw> LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %gep = getelementptr inbounds i16, ptr %pred, i32 %iv
-; CHECK-NEXT:    --> {%pred,+,2}<nuw><%loop> U: full-set S: full-set Exits: ((2 * (zext i32 (-1 + %N)<nsw> to i64))<nuw><nsw> + %pred) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {%pred,+,2}<nuw><%loop> U: full-set S: full-set Exits: ((zext i32 (-2 + (2 * %N)<nuw><nsw>)<nsw> to i64) + %pred) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.next = add nuw nsw i32 %iv, 1
 ; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%loop> U: [1,4) S: [1,4) Exits: %N LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @optimized_range_check_unsigned3
@@ -1390,9 +1390,9 @@ define void @ptr_induction_eq_2(ptr %a, i64 %n) {
 ; CHECK-NEXT:    %b = getelementptr inbounds ptr, ptr %a, i64 %n
 ; CHECK-NEXT:    --> ((8 * %n)<nsw> + %a) U: full-set S: full-set
 ; CHECK-NEXT:    %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ]
-; CHECK-NEXT:    --> {%a,+,8}<nuw><%loop> U: full-set S: full-set Exits: ((8 * ((-8 + (8 * %n)<nsw>) /u 8))<nuw> + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {%a,+,8}<nuw><%loop> U: full-set S: full-set Exits: (-8 + (8 * %n)<nsw> + %a) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 8
-; CHECK-NEXT:    --> {(8 + %a),+,8}<nuw><%loop> U: full-set S: full-set Exits: (8 + (8 * ((-8 + (8 * %n)<nsw>) /u 8))<nuw> + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> {(8 + %a),+,8}<nuw><%loop> U: full-set S: full-set Exits: ((8 * %n)<nsw> + %a) LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @ptr_induction_eq_2
 ; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-8 + (8 * %n)<nsw>) /u 8)
 ; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 2305843009213693951
diff --git a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll
new file mode 100644
index 000000000000..8dd8ec47e709
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<scalar-evolution>' -disable-output %s 2>&1 | FileCheck %s
+
+declare void @use(ptr)
+
+define void @udiv4_and_udiv2(i1 %c, ptr %A) {
+; CHECK-LABEL: 'udiv4_and_udiv2'
+; CHECK-NEXT:  Classifying expressions for: @udiv4_and_udiv2
+; CHECK-NEXT:    %start = select i1 %c, i32 512, i32 0
+; CHECK-NEXT:    --> %start U: [0,513) S: [0,513)
+; CHECK-NEXT:    %div.2 = lshr i32 %start, 1
+; CHECK-NEXT:    --> (%start /u 2) U: [0,257) S: [0,257)
+; CHECK-NEXT:    %div.4 = lshr i32 %start, 2
+; CHECK-NEXT:    --> (%start /u 4) U: [0,129) S: [0,129)
+; CHECK-NEXT:    %iv.start = zext i32 %div.4 to i64
+; CHECK-NEXT:    --> ((zext i32 %start to i64) /u 4) U: [0,129) S: [0,129)
+; CHECK-NEXT:    %wide.trip.count = zext i32 %div.2 to i64
+; CHECK-NEXT:    --> ((zext i32 %start to i64) /u 2) U: [0,257) S: [0,257)
+; CHECK-NEXT:    %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    --> {((zext i32 %start to i64) /u 4),+,1}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) /u 2) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.8 = getelementptr i8, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {(((zext i32 %start to i64) /u 4) + %A),+,1}<%loop> U: full-set S: full-set Exits: (((zext i32 %start to i64) /u 2) + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.16 = getelementptr i16, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((2 * ((zext i32 %start to i64) /u 4))<nuw><nsw> + %A),+,2}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.32 = getelementptr i32, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((zext i32 %start to i64) + %A),+,4}<%loop> U: full-set S: full-set Exits: ((2 * (zext i32 %start to i64))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.40 = getelementptr <{ i32, i8 }>, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((5 * ((zext i32 %start to i64) /u 4))<nuw><nsw> + %A),+,5}<%loop> U: full-set S: full-set Exits: ((5 * ((zext i32 %start to i64) /u 2))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.48 = getelementptr <{ i32, i16 }>, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((6 * ((zext i32 %start to i64) /u 4))<nuw><nsw> + %A),+,6}<%loop> U: full-set S: full-set Exits: ((6 * ((zext i32 %start to i64) /u 2))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %iv.next = add i64 %iv, 1
+; CHECK-NEXT:    --> {(1 + ((zext i32 %start to i64) /u 4))<nuw><nsw>,+,1}<%loop> U: full-set S: full-set Exits: (1 + ((zext i32 %start to i64) /u 2))<nuw><nsw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @udiv4_and_udiv2
+; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 4))<nsw> + ((zext i32 %start to i64) /u 2))
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 -1
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 4))<nsw> + ((zext i32 %start to i64) /u 2))
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+;
+entry:
+  %start = select i1 %c, i32 512, i32 0
+  %div.2 = lshr i32 %start, 1
+  %div.4 = lshr i32 %start, 2
+  %iv.start = zext i32 %div.4 to i64
+  %wide.trip.count = zext i32 %div.2 to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ]
+  %gep.8 = getelementptr i8, ptr %A, i64 %iv
+  call void @use(ptr %gep.8)
+  %gep.16 = getelementptr i16, ptr %A, i64 %iv
+  call void @use(ptr %gep.16)
+  %gep.32 = getelementptr i32, ptr %A, i64 %iv
+  call void @use(ptr %gep.32)
+  %gep.40 = getelementptr <{i32, i8}>, ptr %A, i64 %iv
+  call void @use(ptr %gep.40)
+  %gep.48 = getelementptr <{i32, i16}>, ptr %A, i64 %iv
+  call void @use(ptr %gep.48)
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %wide.trip.count
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+define void @udiv3_and_udiv5_mul_4(i1 %c, ptr %A) {
+; CHECK-LABEL: 'udiv3_and_udiv5_mul_4'
+; CHECK-NEXT:  Classifying expressions for: @udiv3_and_udiv5_mul_4
+; CHECK-NEXT:    %start = select i1 %c, i32 512, i32 0
+; CHECK-NEXT:    --> %start U: [0,513) S: [0,513)
+; CHECK-NEXT:    %div.3 = udiv i32 %start, 3
+; CHECK-NEXT:    --> (%start /u 3) U: [0,171) S: [0,171)
+; CHECK-NEXT:    %div.5 = udiv i32 %start, 5
+; CHECK-NEXT:    --> (%start /u 5) U: [0,103) S: [0,103)
+; CHECK-NEXT:    %iv.start = zext i32 %div.5 to i64
+; CHECK-NEXT:    --> ((zext i32 %start to i64) /u 5) U: [0,103) S: [0,103)
+; CHECK-NEXT:    %wide.trip.count = zext i32 %div.3 to i64
+; CHECK-NEXT:    --> ((zext i32 %start to i64) /u 3) U: [0,171) S: [0,171)
+; CHECK-NEXT:    %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    --> {((zext i32 %start to i64) /u 5),+,1}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) /u 3) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.8 = getelementptr i8, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {(((zext i32 %start to i64) /u 5) + %A),+,1}<%loop> U: full-set S: full-set Exits: (((zext i32 %start to i64) /u 3) + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.16 = getelementptr i16, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((2 * ((zext i32 %start to i64) /u 5))<nuw><nsw> + %A),+,2}<%loop> U: full-set S: full-set Exits: ((2 * ((zext i32 %start to i64) /u 3))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.32 = getelementptr i32, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((4 * ((zext i32 %start to i64) /u 5))<nuw><nsw> + %A),+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((zext i32 %start to i64) /u 3))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.40 = getelementptr <{ i32, i8 }>, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((5 * ((zext i32 %start to i64) /u 5))<nuw><nsw> + %A),+,5}<%loop> U: full-set S: full-set Exits: ((5 * ((zext i32 %start to i64) /u 3))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %gep.48 = getelementptr <{ i32, i16 }>, ptr %A, i64 %iv
+; CHECK-NEXT:    --> {((6 * ((zext i32 %start to i64) /u 5))<nuw><nsw> + %A),+,6}<%loop> U: full-set S: full-set Exits: ((6 * ((zext i32 %start to i64) /u 3))<nuw><nsw> + %A) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %iv.next = add i64 %iv, 1
+; CHECK-NEXT:    --> {(1 + ((zext i32 %start to i64) /u 5))<nuw><nsw>,+,1}<%loop> U: full-set S: full-set Exits: (1 + ((zext i32 %start to i64) /u 3))<nuw><nsw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @udiv3_and_udiv5_mul_4
+; CHECK-NEXT:  Loop %loop: backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 5))<nsw> + ((zext i32 %start to i64) /u 3))
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i64 -1
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 5))<nsw> + ((zext i32 %start to i64) /u 3))
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+;
+entry:
+  %start = select i1 %c, i32 512, i32 0
+  %div.3 = udiv i32 %start, 3
+  %div.5 = udiv i32 %start, 5
+  %iv.start = zext i32 %div.5 to i64
+  %wide.trip.count = zext i32 %div.3 to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ]
+  %gep.8 = getelementptr i8, ptr %A, i64 %iv
+  call void @use(ptr %gep.8)
+  %gep.16 = getelementptr i16, ptr %A, i64 %iv
+  call void @use(ptr %gep.16)
+  %gep.32 = getelementptr i32, ptr %A, i64 %iv
+  call void @use(ptr %gep.32)
+  %gep.40 = getelementptr <{i32, i8}>, ptr %A, i64 %iv
+  call void @use(ptr %gep.40)
+  %gep.48 = getelementptr <{i32, i16}>, ptr %A, i64 %iv
+  call void @use(ptr %gep.48)
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %wide.trip.count
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll b/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll
index c79befac2fb1..1c108bd7318e 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll
@@ -17,67 +17,67 @@ define i32 @pr58402_large_number_of_zext(ptr %dst) {
 ; CHECK-NEXT:    %add7 = add i32 %i, 4
 ; CHECK-NEXT:    --> (4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i1 = and i32 %add7, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.1 = add i32 %i1, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (8 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i2 = and i32 %add7.1, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (8 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.2 = add i32 %i2, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (12 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i3 = and i32 %add7.2, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (12 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.3 = add i32 %i3, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (16 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i4 = and i32 %add7.3, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (16 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.4 = add i32 %i4, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (20 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i5 = and i32 %add7.4, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (20 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.5 = add i32 %i5, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (24 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i6 = and i32 %add7.5, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (24 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.6 = add i32 %i6, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (28 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i7 = and i32 %add7.6, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (28 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.7 = add i32 %i7, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (32 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i8 = and i32 %add7.7, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (32 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.8 = add i32 %i8, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (36 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i9 = and i32 %add7.8, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (36 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.9 = add i32 %i9, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (40 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i10 = and i32 %add7.9, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (40 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.10 = add i32 %i10, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (44 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i11 = and i32 %add7.10, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (44 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.11 = add i32 %i11, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (48 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i12 = and i32 %add7.11, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (48 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.12 = add i32 %i12, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (52 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i13 = and i32 %add7.12, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (52 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.13 = add i32 %i13, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (56 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i14 = and i32 %add7.13, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (56 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.14 = add i32 %i14, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (60 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i15 = and i32 %add7.14, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (60 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %add7.15 = add i32 %i15, 4
-; CHECK-NEXT:    --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (64 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:    %i16 = and i32 %add7.15, -2
-; CHECK-NEXT:    --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw>)<nuw><nsw> /u 2))<nuw><nsw> U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
+; CHECK-NEXT:    --> (64 + (2 * ((zext i1 %cmp to i32) /u 2))<nuw><nsw>)<nuw><nsw> U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant }
 ; CHECK-NEXT:  Determining loop execution counts for: @pr58402_large_number_of_zext
 ; CHECK-NEXT:  Loop %header: <multiple exits> Unpredictable backedge-taken count.
 ; CHECK-NEXT:  Loop %header: Unpredictable constant max backedge-taken count.
diff --git a/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll b/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll
deleted file mode 100644
index 899d31d266e5..000000000000
--- a/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: opt -loop-reduce -S < %s | FileCheck %s
-; PR18000
-
-target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@a = global i32 0, align 4
-@b = common global i32 0, align 4
-@e = common global i8 0, align 1
-@d = common global i32 0, align 4
-@c = common global i32 0, align 4
-@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
-
-; Function Attrs: nounwind optsize uwtable
-; CHECK-LABEL: foo
-define i32 @foo() {
-entry:
-  %.pr = load i32, ptr @b, align 4
-  %cmp10 = icmp slt i32 %.pr, 1
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %entry.for.end9_crit_edge
-
-entry.for.end9_crit_edge:                         ; preds = %entry
-  %.pre = load i32, ptr @c, align 4
-  br label %for.end9
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %0 = load i32, ptr @a, align 4
-  %tobool = icmp eq i32 %0, 0
-  br i1 %tobool, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge, label %return.loopexit.split
-
-for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader.lr.ph, %for.inc8
-  %1 = phi i32 [ %inc, %for.inc8 ], [ %.pr, %for.cond1.preheader.lr.ph ]
-  br label %if.end
-
-; CHECK-LABEL: if.end
-if.end:                                           ; preds = %if.end, %for.cond1.preheader.for.cond1.preheader.split_crit_edge
-
-; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %if.end ], [ 258, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ]
-  %indvars.iv = phi i32 [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %indvars.iv.next, %if.end ]
-
-  %2 = phi i8 [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %dec, %if.end ]
-  %conv7 = mul i32 %indvars.iv, 258
-  %shl = and i32 %conv7, 510
-  store i32 %shl, ptr @c, align 4
-
-; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, -258
-  %dec = add i8 %2, -1
-
-  %cmp2 = icmp sgt i8 %dec, -1
-  %indvars.iv.next = add i32 %indvars.iv, -1
-  br i1 %cmp2, label %if.end, label %for.inc8
-
-for.inc8:                                         ; preds = %if.end
-  store i32 0, ptr @d, align 4
-  %inc = add nsw i32 %1, 1
-  store i32 %inc, ptr @b, align 4
-  %cmp = icmp slt i32 %1, 0
-  br i1 %cmp, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge, label %for.cond.for.end9_crit_edge
-
-for.cond.for.end9_crit_edge:                      ; preds = %for.inc8
-  store i8 %dec, ptr @e, align 1
-  br label %for.end9
-
-for.end9:                                         ; preds = %entry.for.end9_crit_edge, %for.cond.for.end9_crit_edge
-  %3 = phi i32 [ %.pre, %entry.for.end9_crit_edge ], [ %shl, %for.cond.for.end9_crit_edge ]
-  %call = tail call i32 (ptr, ...) @printf(ptr @.str, i32 %3) #2
-  br label %return
-
-return.loopexit.split:                            ; preds = %for.cond1.preheader.lr.ph
-  store i8 1, ptr @e, align 1
-  store i32 0, ptr @d, align 4
-  br label %return
-
-return:                                           ; preds = %return.loopexit.split, %for.end9
-  %retval.0 = phi i32 [ 0, %for.end9 ], [ 1, %return.loopexit.split ]
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind optsize
-declare i32 @printf(ptr nocapture readonly, ...)
-
diff --git a/llvm/test/Analysis/ScalarEvolutionDivision/sdiv.ll b/llvm/test/Analysis/ScalarEvolutionDivision/sdiv.ll
new file mode 100644
index 000000000000..e3cf15eb6353
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolutionDivision/sdiv.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s "-passes=print<scev-division>" -disable-output 2>&1 | FileCheck %s
+
+define noundef i8 @add(i8 %x, i8 %y) {
+; CHECK-LABEL: 'add'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %add, %y
+; CHECK-NEXT:    Numerator: (%x + %y)
+; CHECK-NEXT:    Denominator: %y
+; CHECK-NEXT:    Quotient: 1
+; CHECK-NEXT:    Remainder: %x
+;
+  %add = add i8 %x, %y
+  %div = sdiv i8 %add, %y
+  ret i8 %div
+}
+
+define noundef i8 @mul_add_mul(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: 'mul_add_mul'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %add, %a
+; CHECK-NEXT:    Numerator: ((2 * %c)<nuw><nsw> + (%a * %b)<nuw><nsw>)<nuw><nsw>
+; CHECK-NEXT:    Denominator: %a
+; CHECK-NEXT:    Quotient: %b
+; CHECK-NEXT:    Remainder: (2 * %c)<nuw><nsw>
+;
+  %mul0 = mul nsw nuw i8 %a, %b
+  %mul1 = mul nsw nuw i8 %c, 2
+  %add = add nsw nuw i8 %mul0, %mul1
+  %div = sdiv i8 %add, %a
+  ret i8 %div
+}
+
+define noundef i8 @mul(i8 %x, i8 %y) {
+; CHECK-LABEL: 'mul'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %mul, %y
+; CHECK-NEXT:    Numerator: (%x * %y)
+; CHECK-NEXT:    Denominator: %y
+; CHECK-NEXT:    Quotient: %x
+; CHECK-NEXT:    Remainder: 0
+;
+  %mul = mul i8 %x, %y
+  %div = sdiv i8 %mul, %y
+  ret i8 %div
+}
+
+define noundef i8 @add_mul_add(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: 'add_mul_add'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %mul, %a
+; CHECK-NEXT:    Numerator: ((%a + %b)<nuw><nsw> * (%a + %c)<nuw><nsw>)<nuw><nsw>
+; CHECK-NEXT:    Denominator: %a
+; CHECK-NEXT:    Quotient: 0
+; CHECK-NEXT:    Remainder: ((%a + %b)<nuw><nsw> * (%a + %c)<nuw><nsw>)<nuw><nsw>
+;
+  %add0 = add nsw nuw i8 %a, %b
+  %add1 = add nsw nuw i8 %a, %c
+  %mul = mul nsw nuw i8 %add0, %add1
+  %div = sdiv i8 %mul, %a
+  ret i8 %div
+}
+
+; for (i = 0; i < n; i++)
+;   div = i / den;
+;
+define void @addrec_iv(i8 %n, i8 %den) {
+; CHECK-LABEL: 'addrec_iv'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %i, %den
+; CHECK-NEXT:    Numerator: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT:    Denominator: %den
+; CHECK-NEXT:    Quotient: 0
+; CHECK-NEXT:    Remainder: {0,+,1}<nuw><nsw><%loop>
+;
+entry:
+  %guard = icmp sgt i8 %n, 0
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+  %div = sdiv i8 %i, %den
+  %i.inc = add nsw i8 %i, 1
+  %exitcond = icmp eq i8 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (i = 0; i < n; i++)
+;   div = (step * i) / step;
+;
+define void @addrec_step0(i8 %n, i8 %step) {
+; CHECK-LABEL: 'addrec_step0'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %num, %step
+; CHECK-NEXT:    Numerator: {0,+,%step}<nuw><nsw><%loop>
+; CHECK-NEXT:    Denominator: %step
+; CHECK-NEXT:    Quotient: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT:    Remainder: 0
+;
+entry:
+  %guard = icmp sgt i8 %n, 0
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+  %num = phi i8 [ 0, %entry ], [ %num.next, %loop ]
+  %div = sdiv i8 %num, %step
+  %i.inc = add nsw i8 %i, 1
+  %num.next = add nsw nuw i8 %num, %step
+  %exitcond = icmp eq i8 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (unsigned char i = 0; i < n; i++)
+;   if (cond)
+;     div = (step * i) / step;
+;
+; FIXME: The quotient can cause signed wrap, e.g., when %step is 0 and %n is
+; larger than 127.
+define void @addrec_step1(i8 %n, i8 %step) {
+; CHECK-LABEL: 'addrec_step1'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %num, %step
+; CHECK-NEXT:    Numerator: {0,+,%step}<nuw><nsw><%loop.header>
+; CHECK-NEXT:    Denominator: %step
+; CHECK-NEXT:    Quotient: {0,+,1}<nuw><nsw><%loop.header>
+; CHECK-NEXT:    Remainder: 0
+;
+entry:
+  %guard = icmp ne i8 %n, 0
+  br i1 %guard, label %loop.header, label %exit
+
+loop.header:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop.latch ]
+  %num = phi i8 [ 0, %entry ], [ %num.next, %loop.latch ]
+  %cond = freeze i1 poison
+  br i1 %cond, label %division, label %loop.latch
+
+division:
+  %div = sdiv i8 %num, %step
+  br label %loop.latch
+
+loop.latch:
+  %i.inc = add nuw i8 %i, 1
+  %num.next = add nsw nuw i8 %num, %step
+  %exitcond = icmp eq i8 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+; for (i = 0; i < n; i++)
+;   div = (a + b) * i / a;
+;
+; FIXME: Both the quotient and the remainder can cause signed/unsigned wrap,
+; e.g., when %a + %b = 0 && %a != 0.
+define void @addrec_a_b(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: 'addrec_a_b'
+; CHECK-NEXT:  Instruction: %div = sdiv i8 %num, %step
+; CHECK-NEXT:    Numerator: {0,+,(%a + %b)}<nuw><nsw><%loop>
+; CHECK-NEXT:    Denominator: (%a + %b)
+; CHECK-NEXT:    Quotient: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT:    Remainder: 0
+;
+entry:
+  %guard = icmp sgt i8 %n, 0
+  %step = add nsw nuw i8 %a, %b
+  br i1 %guard, label %loop, label %exit
+
+loop:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+  %num = phi i8 [ 0, %entry ], [ %num.next, %loop ]
+  %div = sdiv i8 %num, %step
+  %i.inc = add nsw i8 %i, 1
+  %num.next = add nsw nuw i8 %num, %step
+  %exitcond = icmp eq i8 %i.inc, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
index 029d87b7ede6..c61113b976a6 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/branch-outside-gmir.mir
@@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn-- -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
 
 # CHECK-LABEL: MachineUniformityInfo for function:  @basic
-# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: CYCLES ASSUMED DIVERGENT:
 # CHECK-NEXT: depth=1: entries(bb.1 bb.3) bb.2
 # CHECK-LABEL: BLOCK bb.1
 # CHECK: DIVERGENT
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
index 524ea4e920cd..dfa05f44bd7e 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/diverged-entry-basic-gmir.mir
@@ -1,7 +1,7 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
 # RUN: llc -mtriple=amdgcn-- -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
 # CHECK-LABEL: MachineUniformityInfo for function:  @divergent_cycle_1
-# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: CYCLES ASSUMED DIVERGENT:
 # CHECK-NEXT: depth=1: entries(bb.3 bb.1) bb.4 bb.2
 # CHECK-NEXT: CYCLES WITH DIVERGENT EXIT:
 # CHECK-NEXT: depth=2: entries(bb.4 bb.1) bb.2
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
index 63e0e785ba47..8627db017eaf 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/exit-divergence-gmir.mir
@@ -1,7 +1,7 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
 # RUN: llc -mtriple=amdgcn-- -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
 # CHECK-LABEL: MachineUniformityInfo for function:  @basic
-# CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 # CHECK: CYCLES WITH DIVERGENT EXIT:
 # CHECK: depth=1: entries(bb.1 bb.3) bb.2
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
index 7050f6dd2210..575092d6f4be 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-2-gmir.mir
@@ -7,7 +7,7 @@
 #              |
 #             bb3
 # CHECK-LABEL: MachineUniformityInfo for function:  @cycle_diverge_enter
-# CHECK-NEXT: CYCLES ASSSUMED DIVERGENT:
+# CHECK-NEXT: CYCLES ASSUMED DIVERGENT:
 # CHECK-NEXT: depth=1: entries(bb.2 bb.1)
 # CHECK-NEXT: CYCLES WITH DIVERGENT EXIT:
 # CHECK-NEXT: depth=1: entries(bb.2 bb.1)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
index a8f853ade996..d799cd2057f4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
@@ -35,3 +35,73 @@ body:             |
     SI_RETURN
 
 ...
+
+---
+name:            zext_loads
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    %1:_(p0) = G_IMPLICIT_DEF
+    %4:_(p1) = G_IMPLICIT_DEF
+    %6:_(p5) = G_IMPLICIT_DEF
+
+    ; Atomic load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_ZEXTLOAD
+    %0:_(s32) = G_ZEXTLOAD %1(p0) :: (load seq_cst (s16) from `ptr undef`)
+
+    ; flat load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_ZEXTLOAD
+    %2:_(s32) = G_ZEXTLOAD %1(p0) :: (load (s16) from `ptr undef`)
+
+    ; Gloabal load
+    ; CHECK-NOT: DIVERGENT
+    %3:_(s32) = G_ZEXTLOAD %4(p1) :: (load (s16) from `ptr addrspace(1) undef`, addrspace 1)
+
+    ; Private load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_ZEXTLOAD
+    %5:_(s32) = G_ZEXTLOAD %6(p5) :: (volatile load (s16) from `ptr addrspace(5) undef`, addrspace 5)
+    G_STORE %2(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %3(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %5(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %0(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    SI_RETURN
+
+...
+
+---
+name:            sext_loads
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    %1:_(p0) = G_IMPLICIT_DEF
+    %4:_(p1) = G_IMPLICIT_DEF
+    %6:_(p5) = G_IMPLICIT_DEF
+
+    ; Atomic load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_SEXTLOAD
+    %0:_(s32) = G_SEXTLOAD %1(p0) :: (load seq_cst (s16) from `ptr undef`)
+
+    ; flat load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_SEXTLOAD
+    %2:_(s32) = G_SEXTLOAD %1(p0) :: (load (s16) from `ptr undef`)
+
+    ; Gloabal load
+    ; CHECK-NOT: DIVERGENT
+    %3:_(s32) = G_SEXTLOAD %4(p1) :: (load (s16) from `ptr addrspace(1) undef`, addrspace 1)
+
+    ; Private load
+    ; CHECK: DIVERGENT
+    ; CHECK-SAME: G_SEXTLOAD
+    %5:_(s32) = G_SEXTLOAD %6(p5) :: (volatile load (s16) from `ptr addrspace(5) undef`, addrspace 5)
+    G_STORE %2(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %3(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %5(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %0(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    SI_RETURN
+
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/branch-outside.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/branch-outside.ll
index 7fd8ac40e4be..92e98e28c7f8 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/branch-outside.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/branch-outside.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 ; CHECK=LABEL: UniformityInfo for function 'basic':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:   depth=1: entries(P T) Q
 define amdgpu_kernel void @basic(i32 %a, i32 %b, i32 %c) {
 entry:
@@ -38,7 +38,7 @@ exit:
 }
 
 ; CHECK=LABEL: UniformityInfo for function 'nested':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:  depth=1: entries(P T) Q A C B
 define amdgpu_kernel void @nested(i32 %a, i32 %b, i32 %c) {
 entry:
@@ -88,7 +88,7 @@ exit:
 ;            depth=3: entries(B) D
 ;
 ; CHECK-LABEL: UniformityInfo for function 'irreducible_outer_cycle':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:  depth=1: entries(Q R) C B D
 define void @irreducible_outer_cycle(i1 %c1, i1 %c2, i1 %c3) {
 entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
index fb83b19b95d8..033ecac072cb 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-basic.ll
@@ -2,7 +2,7 @@
 
 define amdgpu_kernel void @divergent_cycle_1(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: UniformityInfo for function 'divergent_cycle_1':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:   depth=1: entries(R P) S Q
 ; CHECK: CYCLES WITH DIVERGENT EXIT:
 ; CHECK:   depth=2: entries(S P) Q
@@ -40,7 +40,7 @@ exit:
 
 define amdgpu_kernel void @uniform_cycle_1(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: UniformityInfo for function 'uniform_cycle_1':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 entry:
  %cond.uni = icmp slt i32 %a, 0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 8dd44eb878e9..35a165db5486 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -10,7 +10,7 @@
 ;; both cycles are reported as converged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_p':
-;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+;; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
 define amdgpu_kernel void @headers_b_p(i32 %a, i32 %b, i32 %c) {
@@ -68,7 +68,7 @@ exit:
 ;; both cycles are reported as converged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_a_p':
-;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+;; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
 define amdgpu_kernel void @headers_a_p(i32 %a, i32 %b, i32 %c) {
@@ -126,7 +126,7 @@ exit:
 ;; only the inner cycle is reported as diverged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK: CYCLES ASSUMED DIVERGENT:
 ;; CHECK:   depth=2: entries(T P) S Q R
 ;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
@@ -184,7 +184,7 @@ exit:
 ;; Hence the outermost cycle is reported as diverged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_a_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK: CYCLES ASSUMED DIVERGENT:
 ;; CHECK:   depth=1: entries(A B) D T S Q P R C
 ;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
index efad77b684a7..6d1a95b71ea5 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
@@ -10,7 +10,7 @@
 ;; the entire cycle is reported as converged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 't_header':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK: CYCLES ASSUMED DIVERGENT:
 ;; CHECK:   depth=1: entries(T P) S Q R
 
 define amdgpu_kernel void @t_header(i32 %a, i32 %b, i32 %c) {
@@ -59,7 +59,7 @@ exit:
 ;; the cycle is reported as converged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'p_header':
-;; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+;; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 
 define amdgpu_kernel void @p_header(i32 %a, i32 %b, i32 %c) {
 entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/exit-divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/exit-divergence.ll
index 2a3ff4166213..784f58eb3fed 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/exit-divergence.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/exit-divergence.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
 ; CHECK=LABEL: UniformityInfo for function 'basic':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK: CYCLES WITH DIVERGENT EXIT:
 ; CHECK:   depth=1: entries(P T) Q
 define amdgpu_kernel void @basic(i32 %a, i32 %b, i32 %c) {
@@ -39,7 +39,7 @@ exit:
 }
 
 ; CHECK-LABEL: UniformityInfo for function 'outer_reducible':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK: CYCLES WITH DIVERGENT EXIT:
 ; CHECK:   depth=1: entries(H) P T R Q
 define amdgpu_kernel void @outer_reducible(i32 %a, i32 %b, i32 %c) {
@@ -96,7 +96,7 @@ exit:
 ; unless the def itself is divergent.
 ;
 ; CHECK-LABEL: UniformityInfo for function 'no_divergent_exit':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:   depth=1: entries(H B) C
 ; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 define amdgpu_kernel void @no_divergent_exit(i32 %n, i32 %a, i32 %b) #0 {
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/irreducible-2.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/irreducible-2.ll
index 6b8e7a1a0bb5..91b444647f8b 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/irreducible-2.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/irreducible-2.ll
@@ -571,7 +571,7 @@ X:
 
 define amdgpu_kernel void @always_uniform() {
 ; CHECK-LABEL: UniformityInfo for function 'always_uniform':
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK:   depth=1: entries(bb2 bb3)
 
 bb:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/reducible-headers.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/reducible-headers.ll
index feb29497f80c..503c6f854ec5 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/reducible-headers.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/reducible-headers.ll
@@ -32,7 +32,7 @@
 
 define amdgpu_kernel void @nested_irreducible(i32 %a, i32 %b, i32 %c) {
 ; CHECK=LABEL: UniformityInfo for function 'nested_irreducible':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK: CYCLES WITH DIVERGENT EXIT:
 ; CHECK-DAG:   depth=2: entries(P T) R Q
 ; CHECK-DAG:   depth=1: entries(H) S P T R Q U
@@ -119,7 +119,7 @@ exit:
 
 define amdgpu_kernel void @header_label_1(i32 %a, i32 %b, i32 %c) {
 ; CHECK=LABEL: UniformityInfo for function 'header_label_1':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK: CYCLES WITH DIVERGENT EXIT:
 ; CHECK:  depth=1: entries(H) Q P U T R
 entry:
@@ -187,7 +187,7 @@ exit:
 
 define amdgpu_kernel void @header_label_2(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: UniformityInfo for function 'header_label_2':
-; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
+; CHECK-NOT: CYCLES ASSUMED DIVERGENT:
 ; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 entry:
   %cond.uni = icmp slt i32 %a, 0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
index 7466c2396e6f..f5668cef5d63 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -113,11 +113,40 @@ define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
   ret void
 }
 
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) !reqd_work_group_size !4 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store i32 %id.z, ptr %o
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_div_wavefront_size'
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_x_div_wavefront_size(ptr %o) #3 !reqd_work_group_size !5 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %id.sg = lshr i32 %id.x, 6
+  store i32 %id.sg, ptr %o
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_uniform_in_subgroup'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_y_uniform_in_subgroup(ptr %o) #3 !reqd_work_group_size !5 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store i32 %id.y, ptr %o
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
+attributes #3 = { "target-cpu"="gfx900" "amdgpu-flat-work-group-size"="256,256" }
 
 !0 = !{i32 1, i32 1, i32 1}
 !1 = !{i32 2, i32 1, i32 1}
 !2 = !{i32 1, i32 2, i32 1}
 !3 = !{i32 1, i32 1, i32 2}
+!4 = !{i32 64, i32 1, i32 1}
+!5 = !{i32 128, i32 2, i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/non-header-join.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/non-header-join.ll
index 5fbf435f2f16..3525313cab2f 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/non-header-join.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/non-header-join.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK: CYCLES ASSUMED DIVERGENT:
 ; CHECK-NEXT: depth=1: entries(if.end16 for.cond1) for.body4
 
 define void @foo(i1 %b) {
author	Mingming Liu <mingmingl@google.com>	2025-09-10 15:25:31 -0700
committer	GitHub <noreply@github.com>	2025-09-10 15:25:31 -0700
commit	1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree	57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/Analysis
parent	898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent	b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)