summaryrefslogtreecommitdiff
path: root/llvm/test/Analysis/BasicAA
diff options
context:
space:
mode:
authorMingming Liu <mingmingl@google.com>2025-09-10 15:25:31 -0700
committerGitHub <noreply@github.com>2025-09-10 15:25:31 -0700
commit1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/Analysis/BasicAA
parent898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parentb8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/test/Analysis/BasicAA')
-rw-r--r--llvm/test/Analysis/BasicAA/featuretest.ll6
-rw-r--r--llvm/test/Analysis/BasicAA/scalable-dse-aa.ll248
2 files changed, 252 insertions, 2 deletions
diff --git a/llvm/test/Analysis/BasicAA/featuretest.ll b/llvm/test/Analysis/BasicAA/featuretest.ll
index fd5d2eff10f8..e4cb009f0c63 100644
--- a/llvm/test/Analysis/BasicAA/featuretest.ll
+++ b/llvm/test/Analysis/BasicAA/featuretest.ll
@@ -102,12 +102,14 @@ define i32 @gep_distance_test(ptr %A) {
; cannot alias, even if there is a variable offset between them...
define i32 @gep_distance_test2(ptr %A, i64 %distance) {
; NO_ASSUME-LABEL: @gep_distance_test2(
-; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
+; NO_ASSUME-NEXT: [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
+; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
; NO_ASSUME-NEXT: store i32 7, ptr [[B]], align 4
; NO_ASSUME-NEXT: ret i32 0
;
; USE_ASSUME-LABEL: @gep_distance_test2(
-; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]], i32 1
+; USE_ASSUME-NEXT: [[B_SPLIT:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 [[DISTANCE:%.*]]
+; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[B_SPLIT]], i64 4
; USE_ASSUME-NEXT: store i32 7, ptr [[B]], align 4
; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4), "nonnull"(ptr [[A]]), "align"(ptr [[A]], i64 4) ]
; USE_ASSUME-NEXT: ret i32 0
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
new file mode 100644
index 000000000000..7e980c9bfe38
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -0,0 +1,248 @@
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
+
+define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; Three masked stores (4 active lanes each) hit %arr at byte offsets 16, 32 and 48; the one at offset 32 is never read and is expected to be removed.
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; Only the data stored at offsets 16 and 48 is loaded back below.
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <4 x float> @dead_scalable_store_fixed(ptr %0) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+; Fixed-width variant: the store at offset 36 uses %mask2 (3 active lanes, bytes 36..47), is never loaded back, and is expected to be removed.
+ %arr = alloca [64 x i32], align 4
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+ %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
+
+ %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
+
+ %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+; Only the data stored at offsets 16 and 48 is loaded back below.
+ %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ %fadd = fadd <4 x float> %faddop0, %faddop1
+
+ ret <4 x float> %fadd
+}
+
+define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; The store at offset 30 overlaps the bytes (16..31) read back by the 4-lane load at offset 16, so all three stores are expected to stay.
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; The loads below read from offsets 16 and 48 with the full 4-lane mask.
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+ %arr = alloca [64 x i32], align 4
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; The store at offset 30 is the one expected to be removed: neither load at the end of the function reads its bytes.
+ %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
+ %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
+ %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
+ %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
+ %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
+ %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46
+
+ %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
+
+ %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+; %smallmask enables only 2 lanes, so the load at offset 16 reads bytes 16..23 and does not reach the store at offset 30.
+ %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 2)
+ %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
+ %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1
+
+ ret <vscale x 4 x float> %fadd
+}
+
+define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
+; CHECK-NOT: store i32 20, ptr %gep.1.12
+; The i32 store at %1+12 (bytes 12..15) is overwritten by the later masked store to %1 (4 active float lanes, bytes 0..15), so it is expected to be removed.
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 20, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-NOT: store i32 20, ptr %1
+; CHECK: store i32 50, ptr %gep.5
+define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+ %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+ store i32 20, ptr %1
+; %gep.5 is byte offset 20, past the 16 bytes a <4 x float> masked store to %1 can write, so the store of 50 must stay.
+ %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
+ store i32 50, ptr %gep.5
+; The mask bound (7) exceeds the 4 available lanes, so every lane is active and the store of 20 to %1 is overwritten.
+ %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
+ %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %retval
+}
+
+; We don't know whether the scalar stores are dead because vscale is unknown:
+; the active lane mask may enable 4 or 8 lanes depending on vscale.
+define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
+; CHECK: store i32 10, ptr %gep.1.12
+; CHECK: store i32 20, ptr %gep.1.28
+; Both scalar stores (byte offsets 12 and 28 from %1) are expected to stay.
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 10, ptr %gep.1.12
+ %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
+ store i32 20, ptr %gep.1.28
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't remove anything if the mask's Op1 < Op0 (here base 4 is already past the bound 2).
+define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
+; CHECK: store i32 20, ptr %1
+; The scalar store to %1 is expected to stay even though a masked store to %1 follows it.
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+; Don't remove anything if the mask's Op1 == Op0 (here base 2 equals the bound 2).
+define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
+; CHECK: store i32 20, ptr %1
+; The scalar store to %1 is expected to stay even though a masked store to %1 follows it.
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
+ store i32 20, ptr %1
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}
+
+define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
+; CHECK-NOT: store i8 60, ptr %gep.1.6
+; CHECK: store i8 120, ptr %gep.1.8
+; The mask enables 7 i8 lanes (bytes 0..6 of %1): the store at offset 6 is overwritten and expected to be removed, the one at offset 8 must stay.
+ %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 0, i8 7)
+ %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
+ store i8 60, ptr %gep.1.6
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i8 120, ptr %gep.1.8
+
+ %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
+ %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %retval
+}
+
+define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
+; CHECK-NOT: store i32 10, ptr %gep.1.0
+; CHECK-NOT: store i32 20, ptr %gep.1.4
+; CHECK-NOT: store i32 30, ptr %gep.1.8
+; CHECK: store i32 40, ptr %gep.1.12
+; The lane mask has base 1 and bound 4, i.e. 3 active lanes (bytes 0..11 of %1): the stores at 0, 4 and 8 are expected to be removed, the one at 12 must stay.
+ %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+ %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
+ store i32 10, ptr %gep.1.0
+ %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
+ store i32 20, ptr %gep.1.4
+ %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
+ store i32 30, ptr %gep.1.8
+ %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
+ store i32 40, ptr %gep.1.12
+
+ %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
+ %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+ ret <vscale x 4 x float> %retval
+}