diff options
| author | Florian Mayer <fmayer@google.com> | 2025-10-22 10:55:10 -0700 |
|---|---|---|
| committer | Florian Mayer <fmayer@google.com> | 2025-10-22 10:55:10 -0700 |
| commit | f5f8398d7fe18a968f5873518e87d5fdd8269359 (patch) | |
| tree | 347dff286c3b48b2336fb7a425adfceebd478116 /llvm/test/Transforms/LoopVectorize | |
| parent | 73edaec4a6cd1212f9ae819c413d2cf58216d3b1 (diff) | |
| parent | a0abc0af0a0a90878822f8107d70dad6f7cdfc26 (diff) | |
Created using spr 1.3.7
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
32 files changed, 1623 insertions, 303 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll index 2b5960eb91e5..90d0db2a9acf 100644 --- a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll +++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -56,7 +56,7 @@ if.then46: ; preds = %for.body40 br label %for.inc50 for.inc50: ; preds = %if.then46, %for.body40 - %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ] + %k.1 = phi i32 [ 0, %for.body40 ], [ %inc47, %if.then46 ] %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ] %indvars.iv.next124 = add i64 %indvars.iv123, 1 %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 1c2840f7e7ce..70532ad6586c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll index 20b536499afa..ebf4a4fabb60 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -9,9 +9,7 @@ define i32 @fn1() local_unnamed_addr #0 { ; We expect the backend to expand all reductions. ; CHECK: @llvm.vector.reduce entry: - %0 = load i32, ptr @b, align 4, !tbaa !1 - %cmp40 = icmp sgt i32 %0, 0 - br i1 %cmp40, label %for.body.lr.ph, label %for.end + br label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %1 = load ptr, ptr @a, align 8, !tbaa !5 @@ -21,8 +19,8 @@ for.body.lr.ph: ; preds = %entry for.body: ; preds = %for.body.lr.ph, %for.body %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ] - %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ] + %d.043 = phi i16 [ 0, %for.body.lr.ph ], [ %.sink28, %for.body ] + %c.042 = phi i16 [ 0, %for.body.lr.ph ], [ %c.0., %for.body ] %arrayidx = getelementptr inbounds i16, ptr %1, i64 %indvars.iv %4 = load i16, ptr %arrayidx, align 2, !tbaa !7 %cmp2 = icmp sgt i16 %c.042, %4 @@ -33,10 +31,8 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i64 %indvars.iv.next, %3 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry - %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ] - %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ] - %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa +for.end: ; preds = %for.body + %cmp26 = icmp sgt i16 %c.0., %.sink28 %conv27 = zext i1 %cmp26 to i32 ret i32 %conv27 } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll index 44820e061211..33ce300d6859 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -18,7 +18,7 @@ define void @_Z1dv() local_unnamed_addr #0 { ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ] -; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ] ; CHECK-NEXT: [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4 ; CHECK-NEXT: [[CONV:%.*]] = and i32 [[F_0]], 65535 ; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] @@ -50,7 +50,7 @@ entry: for.cond: ; preds = %for.cond.cleanup, %entry %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ] - %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ] + %g.0 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %for.cond.cleanup ] %cmp12 = icmp ult i32 %g.0, 4 %conv = and i32 %f.0, 65535 br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 11cc97158677..fb7890a3b82f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <4 x i32> ; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]] +; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index de804600f811..786a2aab6d0e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -728,8 +728,8 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) @@ -755,8 +755,8 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] - %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] %a = getelementptr inbounds %struct.IntFloat, ptr %p, i64 %indvars.iv, i32 0 %load1 = load i32, ptr %a, align 4 %add = add nsw i32 %load1, %SumA.013 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll index 005ca8c9b2d9..ae8dc2d02fed 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll @@ -175,28 +175,18 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -237,28 +227,18 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index 2a19402347e4..b23702dc088b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -319,46 +319,46 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8 -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8 -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP23]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[TMP24]], i64 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP33]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <2 x double> poison, double [[TMP25]], i64 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT12]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP37]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <2 x double> poison, double [[TMP26]], i64 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT14]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP39]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <2 x double> poison, double [[TMP27]], i64 0 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT16]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = fmul <2 x double> [[WIDE_LOAD]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[WIDE_LOAD12]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD13]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD14]], splat (double 5.000000e+00) ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP36]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP32]], align 8 -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> [[TMP29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC15:%.*]] = shufflevector <4 x double> [[TMP37]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP33]], align 8 -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x double> [[TMP30]], <2 x double> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC16:%.*]] = shufflevector <4 x double> [[TMP38]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP34]], align 8 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> [[TMP31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC17:%.*]] = shufflevector <4 x double> [[TMP39]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP35]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP32]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP34]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP38]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP35]], align 8 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[TMP22]] -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP40]], align 8 -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC15]], ptr [[TMP41]], align 8 -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC16]], ptr [[TMP42]], align 8 -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC17]], ptr [[TMP43]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[TMP40]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[TMP42]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP41]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[TMP43]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -435,7 +435,7 @@ exit: ret void } -; FIXME: We should interleave by 2 after narrowing interleave groups to saturate +; We should interleave by 2 after narrowing interleave groups to saturate ; load/store units. define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK-LABEL: define void @test_interleave_after_narrowing( @@ -447,12 +447,18 @@ define void @test_interleave_after_narrowing(i32 %n, ptr %x, ptr noalias %y) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[TMP5]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[TMP5]] ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index c26176028626..286549595414 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -13,12 +13,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2: [[VECTOR_PH]]: ; VF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF2: [[VECTOR_BODY]]: -; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[DATA]], align 8 -; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> -; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> -; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[DATA]], align 8 +; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[DATA]], align 8 ; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index 6c36dfb81311..305a6920fad1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -1,29 +1,81 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 -; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -p loop-vectorize -force-vector-interleave=1 -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=IC1 %s +; RUN: opt -p loop-vectorize -S -mcpu=neoverse-512tvb %s | FileCheck --check-prefixes=CHECK %s target triple = "aarch64-unknown-linux" define void @load_store_interleave_group(ptr noalias %data) { +; IC1-LABEL: define void @load_store_interleave_group( +; IC1-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 +; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8 +; IC1-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8 +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; ; CHECK-LABEL: define void @load_store_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP24]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 8 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP22]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP23]], align 8 ; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD1]], ptr [[TMP21]], align 8 +; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD2]], ptr [[TMP22]], align 8 +; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD3]], ptr [[TMP23]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -53,27 +105,82 @@ exit: } define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr noalias %factor) { +; IC1-LABEL: define void @test_2xi64_unary_op_load_interleave_group( +; IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP1]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 +; IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP4]] +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8 +; IC1-NEXT: [[TMP6:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD]] +; IC1-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP5]], align 8 +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1111, [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; ; CHECK-LABEL: define void @test_2xi64_unary_op_load_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1111, [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP20]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP29]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP24]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 2 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP21]], align 8 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x double>, ptr [[TMP22]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x double>, ptr [[TMP23]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fneg <vscale x 2 x double> [[TMP7]] +; CHECK-NEXT: [[TMP25:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP26:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP27:%.*]] = fneg <vscale x 2 x double> [[WIDE_LOAD3]] ; CHECK-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: store <vscale x 2 x double> [[TMP25]], ptr [[TMP21]], align 8 +; CHECK-NEXT: store <vscale x 2 x double> [[TMP26]], ptr [[TMP22]], align 8 +; CHECK-NEXT: store <vscale x 2 x double> [[TMP27]], ptr [[TMP23]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -103,3 +210,175 @@ loop: exit: ret void } + +define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) { +; IC1-LABEL: define void @test_masked_interleave_group( +; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; IC1-NEXT: [[ENTRY:.*:]] +; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; IC1: [[VECTOR_MEMCHECK]]: +; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; +; CHECK-LABEL: define void @test_masked_interleave_group( +; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ] + %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ] + %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ] + %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1 + %mask.val = load i8, ptr %mask.iv, align 1 + %should.copy = icmp eq i8 %mask.val, 0 + br i1 %should.copy, label %then, label %loop.latch + +then: + %elem0 = load float, ptr %src.iv, align 4 + store float %elem0, ptr %dst.iv, align 4 + %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4 + %s1 = load float, ptr %src.1.ptr, align 4 + %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4 + store float %s1, ptr %dst.1.ptr, align 4 + %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8 + %s2 = load float, ptr %src.2.ptr, align 4 + %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8 + store float %s2, ptr %dst.2.ptr, align 4 + %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12 + %s3 = load float, ptr %src.3.ptr, align 4 + %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12 + store float %s3, ptr %dst.3.ptr, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %src.iv.next = getelementptr i8, ptr %src.iv, i64 16 + %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16 + %ec = icmp eq i32 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index d290f2d4f5bc..abfb44dcbd36 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -60,32 +60,26 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> -; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]] ; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll index 75980ba1189c..f2e689c0b419 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll @@ -328,10 +328,8 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) { ; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] -; VF2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll index 44a48a9c262f..0f398a69a4bc 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll @@ -84,15 +84,14 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: We can vectorize this loop! define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07 %0 = load i32, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07 @@ -107,9 +106,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi i32 [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret i32 %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret i32 %add.lcssa } ; Floating-point loops need fast-math to be vectorizeable @@ -121,15 +119,14 @@ for.end: ; preds = %for.end.loopexit, % ; DARWIN: We can vectorize this loop! define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ] %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07 %0 = load float, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07 @@ -144,9 +141,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi float [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret float %add.lcssa } ; Make sure calls that turn into builtins are also covered @@ -252,7 +248,7 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07 %0 = load i32, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07 @@ -268,7 +264,7 @@ for.end.loopexit: ; preds = %for.body br label %for.end for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + %Red.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] ret i32 %Red.0.lcssa } @@ -277,15 +273,14 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: We can vectorize this loop! define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ] %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07 %0 = load float, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07 @@ -300,9 +295,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi float [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret float %add.lcssa } ; Make sure calls that turn into builtins are also covered diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll index fe3504bc4b67..23609b1041ae 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -249,7 +249,7 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 { br label %7 7: ; preds = %5, %155 - %8 = phi i32 [ %10, %155 ], [ undef, %5 ] + %8 = phi i32 [ %10, %155 ], [ 0, %5 ] %9 = phi i32 [ %156, %155 ], [ 0, %5 ] %10 = shl i32 %8, 4 store i32 %10, ptr %6, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index c8d20dccbb32..e42e2c73d588 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi" %struct.TwoInts = type { i32, i32 } %struct.ThreeInts = type { i32, i32, i32 } %struct.FourInts = type { i32, i32, i32, i32 } +%struct.TwoShorts = type { i16, i16 } %struct.ThreeShorts = type { i16, i16, i16 } %struct.FourShorts = type { i16, i16, i16, i16 } %struct.TwoBytes = type { i8, i8 } @@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi" %struct.FourBytes = type { i8, i8, i8, i8 } %struct.FiveBytes = type { i8, i8, i8, i8, i8 } %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.TwoFloats = type { float, float } +%struct.FourFloats = type { float, float, float, float } ; CHECK-LABEL: two_ints_same_op ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10 @@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, 34: ; preds = %6, %4 ret void } + +; CHECK-LABEL: two_floats_same_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. +define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_vary_op +; CHECK: LV: Scalar loop costs: 14 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 14. +; CHECK: LV: Vector loop of width 2 costs: 19. +; CHECK: LV: Vector loop of width 4 costs: 15. +; CHECK: LV: Selecting VF: 1. +define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp20.not = icmp eq i32 %N, 0 + br i1 %cmp20.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %inc = add nuw i32 %i.021, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_bytes_two_floats_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Selecting VF: 4. +define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_bytes_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 21 +; CHECK: LV: Vector loop of width 4 costs: 14. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_same_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp24.not = icmp eq i32 %N, 0 + br i1 %cmp24.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %inc = add nuw i32 %i.025, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_shorts_two_floats_vary_op +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 18 +; CHECK: LV: Vector loop of width 2 costs: 22 +; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp23.not = icmp eq i32 %N, 0 + br i1 %cmp23.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %add = fadd float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 + store float %add, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %sub = fsub float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %sub, ptr %y12, align 4 + %inc = add nuw i32 %i.024, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_same_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp22.not = icmp eq i32 %N, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %inc = add nuw i32 %i.023, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: two_floats_two_shorts_vary_op +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 +; CHECK: LV: Scalar loop costs: 16 +; CHECK: LV: Vector loop of width 2 costs: 20 +; CHECK: LV: Vector loop of width 4 costs: 13. +; CHECK: LV: Selecting VF: 4. +define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp21.not = icmp eq i32 %N, 0 + br i1 %cmp21.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %conv = fptosi float %add to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %conv8 = fptosi float %sub to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %inc = add nuw i32 %i.022, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 + store float %mul, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %mul8, ptr %y10, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %mul14 = fmul float %4, %5 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul14, ptr %z16, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %mul20 = fmul float %6, %7 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %mul20, ptr %w22, align 4 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 24 +; CHECK: LV: Vector loop of width 2 costs: 33 +; CHECK: LV: Vector loop of width 4 costs: 30 +; CHECK: LV: Selecting VF: 1 +define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp42.not = icmp eq i32 %N, 0 + br i1 %cmp42.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd float %0, %1 + %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 + store float %add, ptr %arrayidx3, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %sub = fsub float %2, %3 + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store float %sub, ptr %y9, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z12, align 4 + %mul = fmul float %4, %5 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8 + store float %mul, ptr %z14, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w17, align 4 + %div = fdiv float %6, %7 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12 + store float %div, ptr %w19, align 4 + %inc = add nuw i32 %i.043, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv15 = sitofp i8 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z17, align 1 + %conv18 = sitofp i8 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv23 = sitofp i8 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w25, align 1 + %conv26 = sitofp i8 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_bytes_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 43 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 + %1 = load i8, ptr %arrayidx1, align 1 + %conv3 = sitofp i8 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 + %2 = load i8, ptr %y, align 1 + %conv7 = sitofp i8 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 + %3 = load i8, ptr %y9, align 1 + %conv10 = sitofp i8 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %4 = load i8, ptr %z, align 1 + %conv14 = sitofp i8 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %5 = load i8, ptr %z16, align 1 + %conv17 = sitofp i8 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 + %6 = load i8, ptr %w, align 1 + %conv21 = sitofp i8 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3 + %7 = load i8, ptr %w23, align 1 + %conv24 = sitofp i8 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i8 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv9, ptr %y11, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i8 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv16, ptr %z18, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i8 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv23, ptr %w25, align 1 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_bytes_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 38 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i8 + %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 + store i8 %conv, ptr %arrayidx3, align 1 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i8 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 + store i8 %conv8, ptr %y10, align 1 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i8 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i8 %conv14, ptr %z16, align 1 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i8 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 + store i8 %conv20, ptr %w22, align 1 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp52.not = icmp eq i32 %N, 0 + br i1 %cmp52.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %mul11 = fmul float %conv7, %conv10 + %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %mul11, ptr %y13, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv15 = sitofp i16 %4 to float + %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z17, align 2 + %conv18 = sitofp i16 %5 to float + %mul19 = fmul float %conv15, %conv18 + %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %mul19, ptr %z21, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv23 = sitofp i16 %6 to float + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w25, align 2 + %conv26 = sitofp i16 %7 to float + %mul27 = fmul float %conv23, %conv26 + %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %mul27, ptr %w29, align 4 + %inc = add nuw i32 %i.053, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_shorts_four_floats_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 32 +; CHECK: LV: Vector loop of width 2 costs: 37 +; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp49.not = icmp eq i32 %N, 0 + br i1 %cmp49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %0 to float + %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 + %1 = load i16, ptr %arrayidx1, align 2 + %conv3 = sitofp i16 %1 to float + %mul = fmul float %conv, %conv3 + %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 + store float %mul, ptr %arrayidx4, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 + %2 = load i16, ptr %y, align 2 + %conv7 = sitofp i16 %2 to float + %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 + %3 = load i16, ptr %y9, align 2 + %conv10 = sitofp i16 %3 to float + %add = fadd float %conv7, %conv10 + %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 + store float %add, ptr %y12, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %4 = load i16, ptr %z, align 2 + %conv14 = sitofp i16 %4 to float + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %5 = load i16, ptr %z16, align 2 + %conv17 = sitofp i16 %5 to float + %div = fdiv float %conv14, %conv17 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 + store float %div, ptr %z19, align 4 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 + %6 = load i16, ptr %w, align 2 + %conv21 = sitofp i16 %6 to float + %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6 + %7 = load i16, ptr %w23, align 2 + %conv24 = sitofp i16 %7 to float + %sub = fsub float %conv21, %conv24 + %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 + store float %sub, ptr %w26, align 4 + %inc = add nuw i32 %i.050, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_same_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp48.not = icmp eq i32 %N, 0 + br i1 %cmp48.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %mul8 = fmul float %2, %3 + %conv9 = fptosi float %mul8 to i16 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv9, ptr %y11, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z14, align 4 + %mul15 = fmul float %4, %5 + %conv16 = fptosi float %mul15 to i16 + %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv16, ptr %z18, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w21, align 4 + %mul22 = fmul float %6, %7 + %conv23 = fptosi float %mul22 to i16 + %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv23, ptr %w25, align 2 + %inc = add nuw i32 %i.049, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: four_floats_four_shorts_vary_op +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 +; CHECK: LV: Scalar loop costs: 28 +; CHECK: LV: Vector loop of width 2 costs: 35 +; CHECK: LV: Vector loop of width 4 costs: 26 +; CHECK: LV: Selecting VF: 4 +define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { +entry: + %cmp45.not = icmp eq i32 %N, 0 + br i1 %cmp45.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul float %0, %1 + %conv = fptosi float %mul to i16 + %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 + store i16 %conv, ptr %arrayidx3, align 2 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 + %2 = load float, ptr %y, align 4 + %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 + %3 = load float, ptr %y7, align 4 + %add = fadd float %2, %3 + %conv8 = fptosi float %add to i16 + %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 + store i16 %conv8, ptr %y10, align 2 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 + %4 = load float, ptr %z, align 4 + %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8 + %5 = load float, ptr %z13, align 4 + %div = fdiv float %4, %5 + %conv14 = fptosi float %div to i16 + %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 + store i16 %conv14, ptr %z16, align 2 + %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 + %6 = load float, ptr %w, align 4 + %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12 + %7 = load float, ptr %w19, align 4 + %sub = fsub float %6, %7 + %conv20 = fptosi float %sub to i16 + %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6 + store i16 %conv20, ptr %w22, align 2 + %inc = add nuw i32 %i.046, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll index 2ef9d4b40d9a..3718ad23c061 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll @@ -14,7 +14,7 @@ define x86_fp80 @test() { ; CHECK-NEXT: br label [[FOR_BODY3_I_3:%.*]] ; CHECK: for.body3.i.3: ; CHECK-NEXT: [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ] -; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ] +; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 0xK00000000000000000000, [[FOO_EXIT]] ] ; CHECK-NEXT: [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000 ; CHECK-NEXT: [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1 ; CHECK-NEXT: [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1 @@ -28,7 +28,7 @@ foo.exit: for.body3.i.3: ; preds = %for.body3.i.3, %foo.exit %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ] - %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ] + %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ zeroinitializer, %foo.exit ] %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000 %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1 %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll index df1c4f979986..5321d69a9f5f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll @@ -21,10 +21,10 @@ while.cond63.preheader.while.end76_crit_edge: ret void while.body: - %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ] - %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ] - %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ] - %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ] + %d2_fx.015 = phi double [ %sub52, %while.body ], [ 0.0e+00, %entry ] + %d2_fy.014 = phi double [ %sub58, %while.body ], [ 0.0e+00, %entry ] + %d3_fy.013 = phi double [ %div56, %while.body ], [ 0.0e+00, %entry ] + %d3_fx.012 = phi double [ %div50, %while.body ], [ 0.0e+00, %entry ] %div50 = fmul double %d3_fx.012, 1.250000e-01 %sub52 = fsub double 0.000000e+00, %div50 %div56 = fmul double %d3_fy.013, 1.250000e-01 diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll index 2a3ce037e956..f4d80af7089b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll @@ -8,15 +8,69 @@ target triple = "x86_64-unknown-linux" define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) { ; CHECK-LABEL: define void @test_4xi64( ; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ITER_CHECK:.*]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF1:%.*]] = urem i64 [[N]], 16 +; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_MOD_VF1]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP20]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP21]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP22]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP23]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT1]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT8]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP18:%.*]] = mul <4 x i64> [[BROADCAST_SPLAT10]], [[WIDE_LOAD4]] +; CHECK-NEXT: store <4 x i64> [[TMP15]], ptr [[TMP11]], align 8 +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr [[TMP12]], align 8 +; CHECK-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP13]], align 8 +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr [[TMP23]], align 8 +; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] +; CHECK-NEXT: br i1 [[CMP_N1]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF1]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0 @@ -27,15 +81,15 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]] ; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0 @@ -56,7 +110,7 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -117,7 +171,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -140,7 +194,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -195,7 +249,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -218,7 +272,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -273,7 +327,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -296,7 +350,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -351,7 +405,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -374,7 +428,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_0]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -435,7 +489,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -459,7 +513,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -519,7 +573,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -544,7 +598,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -653,7 +707,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -677,7 +731,7 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n) ; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DATA_2]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -711,19 +765,20 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll index 368361fd760e..0f55d79b2d29 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll @@ -15,7 +15,7 @@ for.cond.cleanup: for.body: %i.09 = phi i16 [ 0, %entry ], [ %add3, %for.body ] - %res.08 = phi x86_fp80 [ undef, %entry ], [ %3, %for.body ] + %res.08 = phi x86_fp80 [ zeroinitializer, %entry ], [ %3, %for.body ] %arrayidx = getelementptr inbounds x86_fp80, ptr %a, i16 %i.09 %0 = load x86_fp80, ptr %arrayidx, align 1 %add = or i16 %i.09, 1 diff --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll b/llvm/test/Transforms/LoopVectorize/i8-induction.ll index 220fd64e6a82..712c75d3ed04 100644 --- a/llvm/test/Transforms/LoopVectorize/i8-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll @@ -20,7 +20,7 @@ scalar.ph: for.body: %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ] ; <------- i8 induction var. - %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ] + %c.015 = phi i8 [ 0, %scalar.ph ], [ %conv8, %for.body ] %conv2 = sext i8 %c.015 to i32 %tobool = icmp ne i8 %c.015, 0 %.sink = select i1 %tobool, i8 %c.015, i8 %0 diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index f7376a0f8e20..c164c4a46bd9 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -277,7 +277,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC26]] ; UNROLL-NOSIMPLIFY: for.inc26: -; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ] ; UNROLL-NOSIMPLIFY-NEXT: unreachable ; ; VEC-LABEL: @bug18724( @@ -376,7 +376,7 @@ for.inc23: br i1 %cmp13, label %for.body14, label %for.inc26 for.inc26: - %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ] + %iNewChunks.1.lcssa = phi i32 [ 0, %for.body9 ], [ %iNewChunks.2, %for.inc23 ] unreachable } diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll index 9e750022d8c4..5cf99b899860 100644 --- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll +++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll @@ -53,8 +53,8 @@ thread-pre-split.loopexit: ; preds = %11, %.thread-pre-sp br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21 .lr.ph21: ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader - %d.020 = phi ptr [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ] - %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ] + %d.020 = phi ptr [ zeroinitializer, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ] + %10 = phi i64 [ %28, %26 ], [ zeroinitializer, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ] br i1 %arg, label %11, label %22 ; <label>:11 ; preds = %.lr.ph21 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 16357b3fbc4a..8efe29a54d91 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -791,8 +791,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> @@ -822,8 +822,8 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] - %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0 %tmp = load i32, ptr %a, align 4 %add = add nsw i32 %tmp, %SumA.013 diff --git a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll index 741ca559343d..1675a595072f 100644 --- a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll +++ b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll @@ -48,7 +48,7 @@ for.cond.cleanup.loopexit: br label %for.cond.cleanup, !dbg !33 for.cond.cleanup: - %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33 + %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ 0, %entry ], !dbg !33 %sub = add nsw i32 %0, -5, !dbg !33 %idxprom3 = sext i32 %sub to i64, !dbg !33 %arrayidx4 = getelementptr inbounds i32, ptr %vla, i64 %idxprom3, !dbg !33 diff --git a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll index 659dc62e7ab5..20386334da60 100644 --- a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll @@ -22,8 +22,8 @@ entry: br label %for.body for.body: - %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ] - %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ] + %inc107 = phi i32 [ 0, %entry ], [ %inc10, %for.body ] + %inc6 = phi i32 [ %nf.promoted, %entry ], [ 3, %for.body ] %add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ] %.neg2 = sub i32 0, %inc6 %add.neg = add i32 0, %add55 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index ec7fde81b205..964a257ef352 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -1207,11 +1207,11 @@ for.end: define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: define i32 @reduction_sum_multiuse( ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { -; CHECK-NEXT: [[_LR_PH1:.*]]: +; CHECK-NEXT: [[_LR_PH:.*]]: ; CHECK-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK: [[_LR_PH:.*:]] -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] -; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] +; CHECK: [[_LR_PH1:.*:]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4 ; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] @@ -1231,11 +1231,11 @@ define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocaptu ; ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_multiuse( ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { -; CHECK-INTERLEAVED-NEXT: [[_LR_PH1:.*]]: +; CHECK-INTERLEAVED-NEXT: [[_LR_PH:.*]]: ; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK-INTERLEAVED: [[_LR_PH:.*:]] -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] -; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] +; CHECK-INTERLEAVED: [[_LR_PH1:.*:]] +; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] ; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4 ; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] @@ -1947,7 +1947,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer @@ -1966,7 +1966,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK: [[FOR_BODY2]]: ; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2002,7 +2002,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 @@ -2033,7 +2033,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK-INTERLEAVED: [[SCALAR_PH]]: ; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK-INTERLEAVED: [[FOR_BODY2]]: ; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2063,7 +2063,7 @@ entry: for.body2: ; preds = %entry, %for.inc5 %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ] - %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ] + %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ] %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117 %0 = load i8, ptr %arrayidx, align 1 %tobool3.not = icmp eq i8 %0, 0 @@ -2100,7 +2100,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer @@ -2122,7 +2122,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK: [[FOR_BODY2]]: ; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2159,7 +2159,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 @@ -2196,7 +2196,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK-INTERLEAVED: [[SCALAR_PH]]: ; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK-INTERLEAVED: [[FOR_BODY2]]: ; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2227,7 +2227,7 @@ entry: for.body2: ; preds = %entry, %for.inc5 %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ] - %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ] + %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ] %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117 %0 = load i8, ptr %arrayidx, align 1 %tobool3.not = icmp eq i8 %0, 0 @@ -2263,7 +2263,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 @@ -2340,7 +2340,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE15:.*]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP98:%.*]], %[[PRED_LOAD_CONTINUE15]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 @@ -2480,7 +2480,7 @@ for.cond.cleanup: ; preds = %for.inc for.body: ; preds = %entry, %for.inc %g.09 = phi i32 [ 0, %entry ], [ %inc3, %for.inc ] - %a.08 = phi i32 [ undef, %entry ], [ %a.1, %for.inc ] + %a.08 = phi i32 [ 0, %entry ], [ %a.1, %for.inc ] %d = getelementptr inbounds [0 x %struct.e], ptr %b, i32 0, i32 %g.09, i32 1 %0 = load i32, ptr %d, align 4 %tobool.not = icmp eq i32 %0, 0 diff --git a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll index 43e916b48265..6576675a913c 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll @@ -17,8 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK: <i32 0, i32 -1, i32 -2, i32 -3> ;CHECK: ret define i32 @foo(i32 %n, ptr nocapture %A) { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge + br label %.lr.ph .lr.ph: ; preds = %0 %2 = sext i32 %n to i64 @@ -26,7 +25,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) { ; <label>:3 ; preds = %.lr.ph, %3 %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ] - %sum.01 = phi i32 [ undef, %.lr.ph ], [ %9, %3 ] + %sum.01 = phi i32 [ 0, %.lr.ph ], [ %9, %3 ] %4 = trunc i64 %indvars.iv to i32 %5 = shl nsw i32 %4, 1 %6 = sext i32 %5 to i64 @@ -38,8 +37,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) { %11 = icmp sgt i32 %10, 0 br i1 %11, label %3, label %._crit_edge -._crit_edge: ; preds = %3, %0 - %sum.0.lcssa = phi i32 [ undef, %0 ], [ %9, %3 ] - ret i32 %sum.0.lcssa +._crit_edge: ; preds = %3 + ret i32 %9 } diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 60da3368b664..1216bc1dc33c 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -178,13 +178,14 @@ for.exit: define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1-LABEL: define i32 @recurrence_2( ; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-VF4UF1-NEXT: [[ENTRY:.*]]: -; CHECK-VF4UF1-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-VF4UF1-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK-VF4UF1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4UF1-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK-VF4UF1: [[FOR_PREHEADER]]: ; CHECK-VF4UF1-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1 ; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 -; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] @@ -202,7 +203,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1: [[VECTOR_BODY]]: ; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4 ; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1) @@ -225,25 +226,25 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1: [[SCALAR_PH]]: ; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ] ; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] -; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ] +; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] ; CHECK-VF4UF1-NEXT: br label %[[SCALAR_BODY:.*]] ; CHECK-VF4UF1: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-VF4UF1-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; CHECK-VF4UF1: [[FOR_COND_CLEANUP]]: -; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-VF4UF1-NEXT: ret i32 [[MINMAX_0_LCSSA]] ; CHECK-VF4UF1: [[SCALAR_BODY]]: ; ; CHECK-VF4UF2-LABEL: define i32 @recurrence_2( ; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-VF4UF2-NEXT: [[ENTRY:.*]]: -; CHECK-VF4UF2-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-VF4UF2-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK-VF4UF2-NEXT: [[ENTRY:.*:]] +; CHECK-VF4UF2-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK-VF4UF2: [[FOR_PREHEADER]]: ; CHECK-VF4UF2-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1 ; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 -; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 ; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] @@ -261,8 +262,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF2: [[VECTOR_BODY]]: ; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2 @@ -296,19 +297,17 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF2: [[SCALAR_PH]]: ; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ] ; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] -; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ] +; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] ; CHECK-VF4UF2-NEXT: br label %[[SCALAR_BODY:.*]] ; CHECK-VF4UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-VF4UF2-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; CHECK-VF4UF2: [[FOR_COND_CLEANUP]]: -; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-VF4UF2-NEXT: ret i32 [[MINMAX_0_LCSSA]] ; CHECK-VF4UF2: [[SCALAR_BODY]]: ; entry: - %cmp27 = icmp sgt i32 %n, 0 - br i1 %cmp27, label %for.preheader, label %for.cond.cleanup + br label %for.preheader for.preheader: %arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 -1 @@ -320,13 +319,12 @@ for.cond.cleanup.loopexit: br label %for.cond.cleanup for.cond.cleanup: - %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %minmax.0.lcssa + ret i32 %minmax.0.cond.lcssa scalar.body: %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] + %minmax.028 = phi i32 [ 0, %for.preheader ], [ %minmax.0.cond, %scalar.body ] %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv %1 = load i32, ptr %arrayidx, align 4 %sub3 = sub nsw i32 %1, %0 diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll index 58ad64df1cbe..8224d6b5df9f 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll @@ -16,7 +16,7 @@ entry: br label %for.cond for.cond: ; preds = %for.cond, %entry - %i.0 = phi i32 [ undef, %entry ], [ %inc, %for.cond ] + %i.0 = phi i32 [ poison, %entry ], [ %inc, %for.cond ] %cmp = icmp slt i32 %i.0, 0 %fsub = fsub double undef, undef %fadd = fadd double %fsub, 1.000000e+00 @@ -36,7 +36,7 @@ for.cond7.preheader.lr.ph: ; preds = %for.cond4.preheader for.cond7.preheader: ; preds = %for.cond7.preheader.lr.ph, %for.inc23 %y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ] %i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ] - %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ] + %n.015 = phi i32 [ poison, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ] %1 = load i32, ptr @b, align 4, !tbaa !5 %tobool11 = icmp eq i32 %1, 0 br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index f4d5a84fe67c..cd9eb7208964 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -23,7 +23,7 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i64 [[RDX_SELECT]] @@ -33,7 +33,7 @@ entry: loop: %iv = phi i32 [ 1, %entry ], [ %add, %loop ] - %red = phi i64 [ undef, %entry ], [ %select, %loop ] + %red = phi i64 [ poison, %entry ], [ %select, %loop ] %gep = getelementptr inbounds i32, ptr %src, i32 %iv %l = load i32, ptr %gep %c = icmp eq i32 %l, 1 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 358f1b0d108c..71311db33cf1 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -102,7 +102,7 @@ define void @blend_chain_iv(i1 %c) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2 @@ -139,11 +139,11 @@ loop.next.2: br label %loop.next.3 loop.next.3: - %blend.1 = phi i64 [ undef, %loop.next ], [ %iv, %loop.next.2 ] + %blend.1 = phi i64 [ poison, %loop.next ], [ %iv, %loop.next.2 ] br label %loop.latch loop.latch: ; preds = %loop.next, %loop.header - %blend = phi i64 [ undef, %loop.header ], [ %blend.1, %loop.next.3 ] + %blend = phi i64 [ poison, %loop.header ], [ %blend.1, %loop.next.3 ] %dst.ptr = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 %blend store i16 0, ptr %dst.ptr %iv.next = add nuw nsw i64 %iv, 1 diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll new file mode 100644 index 000000000000..01934b1d7fbd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll @@ -0,0 +1,30 @@ +; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified interleave count is ignored. + +; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop. +; CHECK-LABEL: @loop_distance_4 +define void @loop_distance_4(ptr %a, ptr %b) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ] + %0 = getelementptr i32, ptr %b, i64 %iv + %arrayidx = getelementptr i8, ptr %0, i64 -16 + %1 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %2 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %2, %1 + store i32 %add, ptr %0, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 64 + br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !3} +!2 = !{!"llvm.loop.interleave.count", i32 4} +!3 = !{!"llvm.loop.vectorize.width", i32 4} |
