diff options
Diffstat (limited to 'llvm/test/CodeGen/X86')
65 files changed, 4204 insertions, 541 deletions
diff --git a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll index 67fd59ed4c26..ed3dcad227bc 100644 --- a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll +++ b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll @@ -30,9 +30,9 @@ entry: ret float %conv } -define void @PR17495() { +define void @PR17495(i1 %arg) { entry: - br i1 undef, label %while.end, label %while.body + br i1 %arg, label %while.end, label %while.body while.body: ; preds = %while.body, %entry %x.1.copyload = load i24, ptr undef, align 1 diff --git a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll index 22bf4581c6b4..49de5091f0e5 100644 --- a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll +++ b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll @@ -165,7 +165,7 @@ failure: ; preds = %backedge unreachable } -define void @test_04() { +define void @test_04(i32 %arg) { ; CHECK-LABEL: test_04: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: ud2 @@ -175,7 +175,7 @@ bb: bb1: ; preds = %bb10, %bb %tmp = phi i64 [ 1, %bb ], [ %tmp2, %bb10 ] %tmp2 = add nuw nsw i64 %tmp, 1 - br i1 undef, label %bb21, label %bb7 + br i1 poison, label %bb21, label %bb7 bb7: ; preds = %bb1 %tmp8 = add nsw i64 %tmp, -1 @@ -187,7 +187,7 @@ bb10: ; preds = %bb16 br label %bb1 bb11: ; preds = %bb16, %bb7 - switch i32 undef, label %bb19 [ + switch i32 %arg, label %bb19 [ i32 0, label %bb17 i32 1, label %bb16 i32 2, label %bb15 @@ -205,7 +205,7 @@ bb15: ; preds = %bb11 unreachable bb16: ; preds = %bb11 - br i1 undef, label %bb10, label %bb11 + br i1 poison, label %bb10, label %bb11 bb17: ; preds = %bb11 unreachable diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll index 86874b14b361..faa119cd037f 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll @@ -5,13 +5,13 @@ define void @undef_2phi(ptr%buf) { ; CHECK-LABEL: @undef_2phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[L3:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[L3:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] -; CHECK-NEXT: br i1 undef, label [[L3]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L3]], label [[EXIT:%.*]] ; CHECK: l3: ; CHECK-NEXT: [[TMP2:%.*]] = phi x86_amx [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP2]]) @@ -20,16 +20,16 @@ define void @undef_2phi(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %l3 + br i1 poison, label %l2, label %l3 l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - br i1 undef, label %l3, label %exit + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] + br i1 poison, label %l3, label %exit l3: %t4 = phi <256 x i32> [ %t3, %l2], [ %t2, %l1 ] @@ -45,10 +45,10 @@ define void @foo_undef(ptr%buf) { ; CHECK-LABEL: @foo_undef( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) @@ -57,15 +57,15 @@ define void @foo_undef(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4) br label %exit @@ -78,10 +78,10 @@ define void @foo_zero(ptr%buf) { ; CHECK-LABEL: @foo_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) @@ -90,12 +90,12 @@ define void @foo_zero(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] @@ -112,14 +112,14 @@ define void @foo_vrow(ptr%buf, i16 %row) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP3:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP5]]) @@ -128,15 +128,15 @@ define void @foo_vrow(ptr%buf, i16 %row) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, x86_amx %t4) br label %exit @@ -150,13 +150,13 @@ define void @foo_vcol(ptr%buf, i16 %col) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL]] to i64 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], x86_amx [[T1]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP4]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 @@ -168,12 +168,12 @@ define void @foo_vcol(ptr%buf, i16 %col) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 %col) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] @@ -189,29 +189,29 @@ define void @noshape(ptr%buf) { ; CHECK-LABEL: @noshape( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) store <256 x i32> %t5, ptr %buf @@ -225,14 +225,14 @@ define void @noshape2(ptr%buf) { ; CHECK-LABEL: @noshape2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] ; CHECK-NEXT: [[T6:%.*]] = call <256 x i32> @llvm.abs.v256i32(<256 x i32> [[T3]], i1 true) ; CHECK-NEXT: store <256 x i32> [[T6]], ptr [[BUF:%.*]], align 1024 ; CHECK-NEXT: br label [[EXIT]] @@ -240,15 +240,15 @@ define void @noshape2(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) %t6 = call <256 x i32> @llvm.abs.v256i32(<256 x i32> %t5, i1 1) diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll index b2eb5fd915b9..b70668f7a3de 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll @@ -18,14 +18,14 @@ wrapper_entry: ; Cases where amxcast can be combined across bb ; %5 and %6 is combined together since %goodphi's incoming is phi or amxcast -define void @combine_amx_cast_and_phi() { +define void @combine_amx_cast_and_phi(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -43,7 +43,7 @@ define void @combine_amx_cast_and_phi() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -62,7 +62,7 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can't be combined across bb ; %5 and %6 is not combined together since %evilphi's incoming is not phi or amxcast -define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { +define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp, i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 @@ -71,7 +71,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { ; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]] -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) @@ -92,7 +92,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { ; wrapper_entry: %0 = add <110 x i32> %tmp, %tmp - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -111,7 +111,7 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can't be combined across bb ; %5 and %6 is not combined together since %evilphi's user aka %evilphi2 is not inside phi web. -define void @fail_to_combine_amx_cast_and_phi2() { +define void @fail_to_combine_amx_cast_and_phi2(i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 @@ -123,7 +123,7 @@ define void @fail_to_combine_amx_cast_and_phi2() { ; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512 -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 ; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) @@ -134,13 +134,13 @@ define void @fail_to_combine_amx_cast_and_phi2() { ; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]]) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]]) ; CHECK-NEXT: [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512 -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512 ; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP19]]) -; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: ; CHECK-NEXT: [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[EVILPHI2]], ptr undef, align 512 @@ -149,7 +149,7 @@ define void @fail_to_combine_amx_cast_and_phi2() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -157,27 +157,27 @@ for.body.i.lr.ph.i: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %for.cond.cleanup.i.i, label %exit + br i1 %arg, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) - br i1 undef, label %exit, label %for.body.i.lr.ph.i + br i1 %arg, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] store <110 x i32> %evilphi2, ptr undef, align 512 ret void } -define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() { +define void @fail_to_combine_amx_cast_and_phi_due_to_const_value(i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -193,7 +193,7 @@ define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() { ; CHECK-NEXT: ret void ; wrapper_entry: - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -213,14 +213,14 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can be combined across bb ; When optimizeAMXCastFromPhi process %6 and %goodphi, %goodphi2 is outside the phi-web, so the optimization stop ; When optimizeAMXCastFromPhi process %7 and %goodphi2, the optimization continue. -define void @combine_amx_cast_and_multiple_phi() { +define void @combine_amx_cast_and_multiple_phi(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_multiple_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -229,11 +229,11 @@ define void @combine_amx_cast_and_multiple_phi() { ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) ; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]]) -; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: ; CHECK-NEXT: [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]]) @@ -242,7 +242,7 @@ define void @combine_amx_cast_and_multiple_phi() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -250,13 +250,13 @@ for.body.i.lr.ph.i: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %for.cond.cleanup.i.i, label %exit + br i1 %arg, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) - br i1 undef, label %exit, label %for.body.i.lr.ph.i + br i1 %arg, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) @@ -265,7 +265,7 @@ exit: } ; Currently we are not able to delete DeadPHICycle, later we will handle with them -define void @combine_amx_cast_and_phi_in_a_circle() { +define void @combine_amx_cast_and_phi_in_a_circle(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 @@ -284,7 +284,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() { ; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]]) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512 -; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ] @@ -294,7 +294,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() { ; CHECK-NEXT: [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) -; CHECK-NEXT: br i1 undef, label [[BB2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[BB2]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) ; CHECK-NEXT: ret void @@ -310,7 +310,7 @@ bb1: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %bb2, label %bb3 + br i1 %arg, label %bb2, label %bb3 bb2: ; preds = %bb1, %wrapper_entry %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ] @@ -321,19 +321,19 @@ bb3: %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ] %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7) - br i1 undef, label %bb2, label %exit + br i1 %arg, label %bb2, label %exit exit: %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8) ret void } -define void @eliminate_unused_phi_and_cast() { +define void @eliminate_unused_phi_and_cast(i1 %arg) { ; CHECK-LABEL: @eliminate_unused_phi_and_cast( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) @@ -349,7 +349,7 @@ define void @eliminate_unused_phi_and_cast() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll index 391727d54a03..3a5b424540ff 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll @@ -317,16 +317,16 @@ define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byva ret void } -define void @dead_code(ptr%buf) { +define void @dead_code(ptr%buf, i1 %arg) { ; CHECK-LABEL: @dead_code( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024 @@ -335,12 +335,12 @@ define void @dead_code(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 %arg, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 %arg, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll index 389d024dafd1..db3e7dcdfe2d 100644 --- a/llvm/test/CodeGen/X86/StackColoring.ll +++ b/llvm/test/CodeGen/X86/StackColoring.ll @@ -135,7 +135,7 @@ entry: %t3 = call i32 @foo(i32 %in, ptr %a3) %t4 = call i32 @foo(i32 %in, ptr %a3) call void @llvm.lifetime.end.p0(i64 -1, ptr %a3) - br i1 undef, label %bb2, label %bb3 + br i1 poison, label %bb2, label %bb3 bb2: call void @llvm.lifetime.start.p0(i64 -1, ptr %a4) %t11 = call i32 @foo(i32 %in, ptr %a4) diff --git a/llvm/test/CodeGen/X86/asm-label.ll b/llvm/test/CodeGen/X86/asm-label.ll index 05c37db532f8..2d3e7b624d35 100644 --- a/llvm/test/CodeGen/X86/asm-label.ll +++ b/llvm/test/CodeGen/X86/asm-label.ll @@ -12,15 +12,15 @@ ; SAVETEMP: jne {{.*}} <.LBB0_1> ; SAVETEMP-LABEL: <.LBB0_1>: -define void @foo() { +define void @foo(i1 %arg, i32 %arg2) { entry: - br i1 undef, label %land.lhs.true, label %if.end11 + br i1 %arg, label %land.lhs.true, label %if.end11 land.lhs.true: ; preds = %entry - br i1 undef, label %if.then, label %if.end11 + br i1 %arg, label %if.then, label %if.end11 if.then: ; preds = %land.lhs.true - br i1 undef, label %if.then9, label %if.end + br i1 %arg, label %if.then9, label %if.end if.then9: ; preds = %if.then br label %cleanup @@ -29,7 +29,7 @@ if.end: ; preds = %if.then br label %cleanup cleanup: ; preds = %if.end, %if.then9 - switch i32 undef, label %default [ + switch i32 %arg2, label %default [ i32 0, label %cleanup.cont i32 1, label %if.end11 ] diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll index 7a33daf18be8..1b688c8cf9cc 100644 --- a/llvm/test/CodeGen/X86/avx-select.ll +++ b/llvm/test/CodeGen/X86/avx-select.ll @@ -84,7 +84,7 @@ head: %isneg = icmp slt <4 x i32> %v3, zeroinitializer %or0 = select <4 x i1> %isneg, <4 x i32> <i32 26146, i32 -1257, i32 -2, i32 -3052>, <4 x i32> <i32 -24947, i32 7802, i32 29242, i32 15858> %or1 = shufflevector <4 x i32> %or0, <4 x i32> <i32 29361, i32 -16094, i32 -3080, i32 -26286>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - br i1 undef, label %exit, label %head + br i1 poison, label %exit, label %head exit: store <8 x i32> %or1, ptr addrspace(1) undef, align 32 diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll new file mode 100644 index 000000000000..d7ad7b048c6d --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 + +; VCVTTPD2DQS +define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v8i32_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f) + ret <8 x i32> %x +} + +; VCVTTPD2QQS +define <8 x i64> @test_signed_v8i64_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v8i64_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> %f) + ret <8 x i64> %x +} + +; VCVTTPD2UDQS +define <8 x i32> @test_unsigned_v8i32_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i32_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> %f) + ret <8 x i32> %x +} + +; VCVTTPD2UQQS +define <8 x i64> @test_unsigned_v8i64_v8f64(<8 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i64_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> %f) + ret <8 x i64> %x +} + +; VCVTTPS2DQS +define <16 x i32> @test_signed_v16i32_v16f32(<16 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v16i32_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> %f) + ret <16 x i32> %x +} + +; VCVTTPS2UDQS +define <16 x i32> @test_unsigned_v16i32_v16f32(<16 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v16i32_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> %f) + ret <16 x i32> %x +} +; VCVTTPS2QQS +define <8 x i64> @test_signed_v8i64_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v8i64_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> %f) + ret <8 x i64> %x +} + +; VCVTTPS2UQQS +define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i64_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f) + ret <8 x i64> %x +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; X64: {{.*}} +; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll index 494e4bc8e068..a2f167e94cc2 100644 --- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 ; ; 32-bit float to signed integer @@ -112,3 +112,157 @@ define i64 @test_signed_i64_f64(double %f) nounwind { %x = call i64 @llvm.fptosi.sat.i64.f64(double %f) ret i64 %x } + +; VCVTTPD2DQS +define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %d) nounwind { +; CHECK-LABEL: test_signed_v2i32_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %d) + ret <2 x i32> %x +} + +define <4 x i32> @test_signed_v4i32_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dqs %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> %f) + ret <4 x i32> %x +} + +; VCVTTPD2QQS +define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_signed_v4i64_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_signed_v4i64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> %f) + ret <4 x i64> %x +} + +; VCVTTPD2UDQS +define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %d) nounwind { +; CHECK-LABEL: test_unsigned_v2i32_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %d) + ret <2 x i32> %x +} + +define <4 x i32> @test_unsigned_v4i32_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udqs %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> %f) + ret <4 x i32> %x +} + +; VCVTTPD2UQQS +define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_unsigned_v4i64_v4f64(<4 x double> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> %f) + ret <4 x i64> %x +} + +; VCVTTPS2DQS +define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f) + ret <4 x i32> %x +} + +define <8 x i32> @test_signed_v8i32_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> %f) + ret <8 x i32> %x +} + +; VCVTTPS2UDQS +define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f) + ret <4 x i32> %x +} + +define <8 x i32> @test_unsigned_v8i32_v8f32(<8 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udqs %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> %f) + ret <8 x i32> %x +} + +; VCVTTPS2QQS +define <2 x i64> @test_signed_v2i64_v2f32(<2 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v2i64_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_signed_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qqs %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f) + ret <4 x i64> %x +} + +; VCVTTPS2UQQS +define <2 x i64> @test_unsigned_v2i64_v2f32(<2 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v2i64_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> %f) + ret <2 x i64> %x +} + +define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { +; CHECK-LABEL: test_unsigned_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqqs %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) + ret <4 x i64> %x +} diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll index 3cd733181599..d8683df5cbf7 100644 --- a/llvm/test/CodeGen/X86/avx512-i1test.ll +++ b/llvm/test/CodeGen/X86/avx512-i1test.ll @@ -21,20 +21,20 @@ define void @func() { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jmp .LBB0_2 bb1: - br i1 undef, label %L_10, label %L_10 + br i1 poison, label %L_10, label %L_10 L_10: ; preds = %bb1, %bb1 - br i1 undef, label %L_30, label %bb56 + br i1 poison, label %L_30, label %bb56 bb56: ; preds = %L_10 br label %bb33 bb33: ; preds = %bb51, %bb56 %r111 = load i64, ptr undef, align 8 - br i1 undef, label %bb51, label %bb35 + br i1 poison, label %bb51, label %bb35 bb35: ; preds = %bb33 - br i1 undef, label %L_19, label %bb37 + br i1 poison, label %L_19, label %bb37 bb37: ; preds = %bb35 %r128 = and i64 %r111, 576460752303423488 @@ -43,7 +43,7 @@ bb37: ; preds = %bb35 L_19: ; preds = %bb37, %bb35 %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ] - br i1 undef, label %bb51, label %bb42 + br i1 poison, label %bb51, label %bb42 bb42: ; preds = %L_19 %r136 = select i1 %"$V_S25.0", ptr undef, ptr undef diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index 675293410dfe..136913141305 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -312,7 +312,7 @@ exit: ret i32 %sum } -define void @unnatural_cfg1() { +define void @unnatural_cfg1(i1 %arg) { ; Test that we can handle a loop with an inner unnatural loop at the end of ; a function. This is a gross CFG reduced out of the single source GCC. ; CHECK-LABEL: unnatural_cfg1 @@ -327,7 +327,7 @@ loop.header: br label %loop.body1 loop.body1: - br i1 undef, label %loop.body3, label %loop.body2 + br i1 %arg, label %loop.body3, label %loop.body2 loop.body2: %ptr = load ptr, ptr undef, align 4 @@ -341,14 +341,14 @@ loop.body3: br i1 %comp, label %loop.body4, label %loop.body5 loop.body4: - br i1 undef, label %loop.header, label %loop.body5 + br i1 %arg, label %loop.header, label %loop.body5 loop.body5: %ptr2 = load ptr, ptr undef, align 4 br label %loop.body3 } -define void @unnatural_cfg2(ptr %p0, i32 %a0) { +define void @unnatural_cfg2(ptr %p0, i32 %a0, i1 %arg) { ; Test that we can handle a loop with a nested natural loop *and* an unnatural ; loop. This was reduced from a crash on block placement when run over ; single-source GCC. @@ -372,10 +372,10 @@ loop.header: loop.body1: %val0 = load ptr, ptr undef, align 4 - br i1 undef, label %loop.body2, label %loop.inner1.begin + br i1 %arg, label %loop.body2, label %loop.inner1.begin loop.body2: - br i1 undef, label %loop.body4, label %loop.body3 + br i1 %arg, label %loop.body4, label %loop.body3 loop.body3: %ptr1 = getelementptr inbounds i32, ptr %val0, i32 0 @@ -467,7 +467,7 @@ exit: ret i32 %merge } -define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0) { +define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0, i1 %arg) { ; This function's CFG contains an once-unanalyzable branch (une on floating ; points). As now it becomes analyzable, we should get best layout in which each ; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is @@ -493,7 +493,7 @@ entry.if.then_crit_edge: br label %if.then lor.lhs.false: - br i1 undef, label %if.end, label %exit + br i1 %arg, label %if.end, label %exit exit: %cmp.i = fcmp une double 0.000000e+00, %a0 @@ -516,7 +516,7 @@ declare i32 @f() declare i32 @g() declare i32 @h(i32 %x) -define i32 @test_global_cfg_break_profitability() { +define i32 @test_global_cfg_break_profitability(i1 %arg) { ; Check that our metrics for the profitability of a CFG break are global rather ; than local. A successor may be very hot, but if the current block isn't, it ; doesn't matter. Within this test the 'then' block is slightly warmer than the @@ -530,7 +530,7 @@ define i32 @test_global_cfg_break_profitability() { ; CHECK: ret entry: - br i1 undef, label %then, label %else, !prof !2 + br i1 %arg, label %then, label %else, !prof !2 then: %then.result = call i32 @f() @@ -600,7 +600,7 @@ cleanup: unreachable } -define void @test_unnatural_cfg_backwards_inner_loop() { +define void @test_unnatural_cfg_backwards_inner_loop(i1 %arg) { ; Test that when we encounter an unnatural CFG structure after having formed ; a chain for an inner loop which happened to be laid out backwards we don't ; attempt to merge onto the wrong end of the inner loop just because we find it @@ -612,7 +612,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() { ; CHECK: %loop3 entry: - br i1 undef, label %loop2a, label %body + br i1 %arg, label %loop2a, label %body body: br label %loop2a @@ -692,7 +692,7 @@ exit: ret void } -define void @unanalyzable_branch_to_free_block(float %x) { +define void @unanalyzable_branch_to_free_block(float %x, i1 %arg) { ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the best free block in the CFG. ; @@ -704,7 +704,7 @@ define void @unanalyzable_branch_to_free_block(float %x) { ; CHECK: %exit entry: - br i1 undef, label %a, label %b + br i1 %arg, label %a, label %b a: call i32 @f() diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll index f6b38839d13c..e7ffc4752715 100644 --- a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll +++ b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll @@ -173,7 +173,7 @@ define ghccc void @test5() { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: jmp tail@PLT # TAILCALL entry: - br i1 undef, label %then, label %else + br i1 poison, label %then, label %else then: store i64 0, ptr undef @@ -186,4 +186,3 @@ else: exit: ret void } - diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll index c611e89f2786..f3070cd55903 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll +++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s -o - | FileCheck %s ; This file tests the different cases what are involved when codegen prepare ; tries to get sign/zero extension out of the way of addressing mode. @@ -9,14 +10,17 @@ target triple = "x86_64-apple-macosx" ; Check that we correctly promote both operands of the promotable add. -; CHECK-LABEL: @twoArgsPromotion -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64 -; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]] -; CHECK: inttoptr i64 [[PROMOTED]] to ptr -; CHECK: ret define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) { - %add = add nsw i32 %arg1, %arg2 +; CHECK-LABEL: define i8 @twoArgsPromotion( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64 +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i32 [[ARG2]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED2]] +; CHECK-NEXT: [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[BASE]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nsw i32 %arg1, %arg2 %sextadd = sext i32 %add to i64 %base = inttoptr i64 %sextadd to ptr %res = load i8, ptr %base @@ -28,11 +32,16 @@ define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) { ; Otherwise, we will increase the number of instruction executed. ; (This is a heuristic of course, because the new sext could have been ; merged with something else.) -; CHECK-LABEL: @twoArgsNoPromotion -; CHECK: add nsw i32 %arg1, %arg2 -; CHECK: ret define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) { - %add = add nsw i32 %arg1, %arg2 +; CHECK-LABEL: define i8 @twoArgsNoPromotion( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]] +; CHECK-NEXT: [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nsw i32 %arg1, %arg2 %sextadd = sext i32 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -41,11 +50,16 @@ define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) { ; Check that we do not promote when the related instruction does not have ; the nsw flag. -; CHECK-LABEL: @noPromotion -; CHECK-NOT: add i64 -; CHECK: ret define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) { - %add = add i32 %arg1, %arg2 +; CHECK-LABEL: define i8 @noPromotion( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ARG1]], [[ARG2]] +; CHECK-NEXT: [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add i32 %arg1, %arg2 %sextadd = sext i32 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -53,13 +67,16 @@ define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) { } ; Check that we correctly promote constant arguments. -; CHECK-LABEL: @oneArgPromotion -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotion(i32 %arg1, ptr %base) { - %add = add nsw i32 %arg1, 1 +; CHECK-LABEL: define i8 @oneArgPromotion( +; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nsw i32 %arg1, 1 %sextadd = sext i32 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -67,14 +84,17 @@ define i8 @oneArgPromotion(i32 %arg1, ptr %base) { } ; Check that we are able to merge a sign extension with a zero extension. -; CHECK-LABEL: @oneArgPromotionZExt -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionZExt( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %zext = zext i8 %arg1 to i32 - %add = add nsw i32 %zext, 1 + %add = add nsw i32 %zext, 1 %sextadd = sext i32 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -88,11 +108,14 @@ define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) { ; more thing in the addressing mode. Therefore the modification is ; rolled back. ; Still, this test case exercises the desired code path. -; CHECK-LABEL: @oneArgPromotionCstZExt -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 0, 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionCstZExt(ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionCstZExt( +; CHECK-SAME: ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 0, 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %cst = zext i16 undef to i32 %add = add nsw i32 %cst, 1 %sextadd = sext i32 %add to i64 @@ -103,15 +126,18 @@ define i8 @oneArgPromotionCstZExt(ptr %base) { ; Check that we do not promote truncate when we cannot determine the ; bits that are dropped. -; CHECK-LABEL: @oneArgPromotionBlockTrunc1 -; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8 -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc1( +; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ARG1]] to i8 +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %trunc = trunc i32 %arg1 to i8 - %add = add nsw i8 %trunc, 1 + %add = add nsw i8 %trunc, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -120,17 +146,20 @@ define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) { ; Check that we do not promote truncate when we cannot determine all the ; bits that are dropped. -; CHECK-LABEL: @oneArgPromotionBlockTrunc2 -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32 -; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8 -; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc2( +; CHECK-SAME: i16 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[SEXTARG1:%.*]] = sext i16 [[ARG1]] to i32 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8 +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = sext i16 %arg1 to i32 %trunc = trunc i32 %sextarg1 to i8 - %add = add nsw i8 %trunc, 1 + %add = add nsw i8 %trunc, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -139,15 +168,18 @@ define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) { ; Check that we are able to promote truncate when we know all the bits ; that are dropped. -; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionPassTruncKeepSExt( +; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i1 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = sext i1 %arg1 to i32 %trunc = trunc i32 %sextarg1 to i8 - %add = add nsw i8 %trunc, 1 + %add = add nsw i8 %trunc, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -156,17 +188,19 @@ define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) { ; On X86 truncate are free. Check that we are able to promote the add ; to be used as addressing mode and that we insert a truncate for the other -; use. -; CHECK-LABEL: @oneArgPromotionTruncInsert -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8 -; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]] -; CHECK: add i8 [[LOAD]], [[TRUNC]] -; CHECK: ret +; use. define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) { - %add = add nsw i8 %arg1, 1 +; CHECK-LABEL: define i8 @oneArgPromotionTruncInsert( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]] +; CHECK-NEXT: ret i8 [[FINALRES]] +; + %add = add nsw i8 %arg1, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -175,15 +209,20 @@ define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) { } ; Cannot sext from a larger type than the promoted type. -; CHECK-LABEL: @oneArgPromotionLargerType -; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8 -; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionLargerType( +; CHECK-SAME: i128 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i128 [[ARG1]] to i8 +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i8 [[TRUNC]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]] +; CHECK-NEXT: ret i8 [[FINALRES]] +; %trunc = trunc i128 %arg1 to i8 - %add = add nsw i8 %trunc, 1 + %add = add nsw i8 %trunc, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -194,18 +233,20 @@ define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) { ; Use same inserted trunc ; On X86 truncate are free. Check that we are able to promote the add ; to be used as addressing mode and that we insert a truncate for -; *all* the other uses. -; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8 -; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]] -; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]] -; CHECK: add i8 [[ADDRES]], [[TRUNC]] -; CHECK: ret +; *all* the other uses. define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) { - %add = add nsw i8 %arg1, 1 +; CHECK-LABEL: define i8 @oneArgPromotionTruncInsertSeveralUse( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ALMOSTFINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]] +; CHECK-NEXT: [[FINALRES:%.*]] = add i8 [[ALMOSTFINALRES]], [[PROMOTED]] +; CHECK-NEXT: ret i8 [[FINALRES]] +; + %add = add nsw i8 %arg1, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -216,16 +257,18 @@ define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) { ; Check that the promoted instruction is used for all uses of the original ; sign extension. -; CHECK-LABEL: @oneArgPromotionSExtSeveralUse -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1 -; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]] -; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64 -; CHECK: add i64 [[ADDRES]], [[PROMOTED]] -; CHECK: ret define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) { - %add = add nsw i8 %arg1, 1 +; CHECK-LABEL: define i64 @oneArgPromotionSExtSeveralUse( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = sext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ALMOSTFINALRES:%.*]] = zext i8 [[RES]] to i64 +; CHECK-NEXT: [[FINALRES:%.*]] = add i64 [[ALMOSTFINALRES]], [[ADD]] +; CHECK-NEXT: ret i64 [[FINALRES]] +; + %add = add nsw i8 %arg1, 1 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -249,16 +292,19 @@ define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) { ; - Setting the operands of the promoted instruction with the promoted values. ; - Moving instruction around (mainly sext when promoting instruction). ; Each type of those promotions has to be undo at least once during this -; specific test. -; CHECK-LABEL: @twoArgsPromotionNest -; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2 -; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]] -; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]] -; CHECK: ret +; specific test. define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) { +; CHECK-LABEL: define i8 @twoArgsPromotionNest( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTABLEADD1:%.*]] = add nsw i32 [[ARG1]], [[ARG2]] +; CHECK-NEXT: [[PROMOTABLEADD2:%.*]] = add nsw i32 [[PROMOTABLEADD1]], [[PROMOTABLEADD1]] +; CHECK-NEXT: [[SEXTADD:%.*]] = sext i32 [[PROMOTABLEADD2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %promotableadd1 = add nsw i32 %arg1, %arg2 - %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1 + %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1 %sextadd = sext i32 %promotableadd2 to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -270,18 +316,21 @@ define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) { ; The matcher first promotes the add, removes the trunc and promotes ; the sext of arg1. ; Then, the matcher cannot use an addressing mode r + r + r, thus it -; rolls back. -; CHECK-LABEL: @twoArgsNoPromotionRemove -; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32 -; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8 -; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2 -; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]] -; CHECK: ret +; rolls back. define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) { +; CHECK-LABEL: define i8 @twoArgsNoPromotionRemove( +; CHECK-SAME: i1 [[ARG1:%.*]], i8 [[ARG2:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[TRUNC]], [[ARG2]] +; CHECK-NEXT: [[SEXTADD:%.*]] = sext i8 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = sext i1 %arg1 to i32 %trunc = trunc i32 %sextarg1 to i8 - %add = add nsw i8 %trunc, %arg2 + %add = add nsw i8 %trunc, %arg2 %sextadd = sext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd %res = load i8, ptr %arrayidx @@ -301,29 +350,40 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) { ; Check that we did not promote anything in the final matching. ; ; <rdar://problem/16020230> -; CHECK-LABEL: @checkProfitability -; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64 -; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64 -; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1 -; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2 -; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64 ; BB then -; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr -; CHECK: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE1]], i64 48 -; CHECK: load i32, ptr [[FULL1]] ; BB else -; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr -; CHECK: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE2]], i64 48 -; CHECK: load i32, ptr [[FULL2]] -; CHECK: ret define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) { +; CHECK-LABEL: define i32 @checkProfitability( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i1 [[TEST:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[ARG1]], 1 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[SHL]], [[ARG2]] +; CHECK-NEXT: [[SEXTIDX1:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: br i1 [[TEST]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[SUNKADDR:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr +; CHECK-NEXT: [[SUNKADDR13:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i64 48 +; CHECK-NEXT: [[RES1:%.*]] = load i32, ptr [[SUNKADDR13]], align 4 +; CHECK-NEXT: br label %[[END:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[SUNKADDR17:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr +; CHECK-NEXT: [[SUNKADDR18:%.*]] = getelementptr i8, ptr [[SUNKADDR17]], i64 48 +; CHECK-NEXT: [[RES2:%.*]] = load i32, ptr [[SUNKADDR18]], align 4 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ [[RES1]], %[[THEN]] ], [ [[RES2]], %[[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SEXTIDX1]] to i32 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[TMP]], [[TMP1]] +; CHECK-NEXT: [[ADDR:%.*]] = inttoptr i32 [[RES]] to ptr +; CHECK-NEXT: [[FINAL:%.*]] = load i32, ptr [[ADDR]], align 4 +; CHECK-NEXT: ret i32 [[FINAL]] +; %shl = shl nsw i32 %arg1, 1 %add1 = add nsw i32 %shl, %arg2 %sextidx1 = sext i32 %add1 to i64 %tmpptr = inttoptr i64 %sextidx1 to ptr %arrayidx1 = getelementptr i32, ptr %tmpptr, i64 12 br i1 %test, label %then, label %else -then: +then: %res1 = load i32, ptr %arrayidx1 br label %end else: @@ -346,15 +406,47 @@ end: ; We used to crash on this function because we did not return the right ; promoted instruction for %conv.i. ; Make sure we generate the right code now. -; CHECK-LABEL: @fn3 ; %conv.i is used twice and only one of its use is being promoted. ; Use it at the starting point for the matching. -; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32 -; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64 -; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr %P, i64 [[PROMOTED_CONV]] -; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[ADD]], i64 7 -; CHECK-NEXT: load i8, ptr [[ADDR]], align 1 define signext i16 @fn3(ptr nocapture readonly %P) { +; CHECK-LABEL: define signext i16 @fn3( +; CHECK-SAME: ptr nocapture readonly [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[WHILE_BODY_I_I:.*]] +; CHECK: [[WHILE_BODY_I_I]]: +; CHECK-NEXT: [[SRC_ADDR_0_I_I:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[INC_I_I:%.*]], %[[WHILE_BODY_I_I]] ] +; CHECK-NEXT: [[INC_I_I]] = add i16 [[SRC_ADDR_0_I_I]], 1 +; CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i16 [[SRC_ADDR_0_I_I]] to i64 +; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IDXPROM_I_I]] +; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr inbounds i8, ptr [[SUNKADDR]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SUNKADDR2]], align 1 +; CHECK-NEXT: [[CONV2_I_I:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV2_I_I]], 15 +; CHECK-NEXT: store i32 [[AND_I_I]], ptr @a, align 4 +; CHECK-NEXT: [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_I_I]], label %[[WHILE_BODY_I_I]], label %[[FN1_EXIT_I:.*]] +; CHECK: [[FN1_EXIT_I]]: +; CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[INC_I_I]] to i32 +; CHECK-NEXT: [[PROMOTED4:%.*]] = zext i16 [[INC_I_I]] to i64 +; CHECK-NEXT: [[SUNKADDR5:%.*]] = getelementptr i8, ptr [[P]], i64 [[PROMOTED4]] +; CHECK-NEXT: [[SUNKADDR6:%.*]] = getelementptr i8, ptr [[SUNKADDR5]], i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[SUNKADDR6]], align 1 +; CHECK-NEXT: [[CONV2_I:%.*]] = sext i8 [[TMP2]] to i16 +; CHECK-NEXT: store i16 [[CONV2_I]], ptr @b, align 2 +; CHECK-NEXT: [[SUB4_I:%.*]] = sub nsw i32 0, [[CONV_I]] +; CHECK-NEXT: [[CONV5_I:%.*]] = zext i16 [[CONV2_I]] to i32 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[CONV5_I]], [[SUB4_I]] +; CHECK-NEXT: br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[FN2_EXIT:.*]] +; CHECK: [[IF_THEN_I]]: +; CHECK-NEXT: [[END_I:%.*]] = getelementptr inbounds [[STRUCT_DNS_PACKET:%.*]], ptr [[P]], i64 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[END_I]], align 4 +; CHECK-NEXT: [[SUB7_I:%.*]] = add i32 [[TMP3]], 65535 +; CHECK-NEXT: [[CONV8_I:%.*]] = trunc i32 [[SUB7_I]] to i16 +; CHECK-NEXT: br label %[[FN2_EXIT]] +; CHECK: [[FN2_EXIT]]: +; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i16 [ [[CONV8_I]], %[[IF_THEN_I]] ], [ undef, %[[FN1_EXIT_I]] ] +; CHECK-NEXT: ret i16 [[RETVAL_0_I]] +; entry: %tmp = getelementptr inbounds %struct.dns_packet, ptr %P, i64 0, i32 2 br label %while.body.i.i @@ -399,13 +491,16 @@ fn2.exit: ; preds = %if.then.i, %fn1.exi ; Check that we do not promote an extension if the non-wrapping flag does not ; match the kind of the extension. -; CHECK-LABEL: @noPromotionFlag -; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64 -; CHECK: inttoptr i64 [[PROMOTED]] to ptr -; CHECK: ret define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) { - %add = add nsw i32 %arg1, %arg2 +; CHECK-LABEL: define i8 @noPromotionFlag( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]] +; CHECK-NEXT: [[ZEXTADD:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: [[BASE:%.*]] = inttoptr i64 [[ZEXTADD]] to ptr +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[BASE]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nsw i32 %arg1, %arg2 %zextadd = zext i32 %add to i64 %base = inttoptr i64 %zextadd to ptr %res = load i8, ptr %base @@ -413,14 +508,17 @@ define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) { } ; Check that we correctly promote both operands of the promotable add with zext. -; CHECK-LABEL: @twoArgsPromotionZExt -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64 -; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]] -; CHECK: inttoptr i64 [[PROMOTED]] to ptr -; CHECK: ret define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) { - %add = add nuw i32 %arg1, %arg2 +; CHECK-LABEL: define i8 @twoArgsPromotionZExt( +; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = zext i32 [[ARG1]] to i64 +; CHECK-NEXT: [[PROMOTED2:%.*]] = zext i32 [[ARG2]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED]], [[PROMOTED2]] +; CHECK-NEXT: [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[BASE]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nuw i32 %arg1, %arg2 %zextadd = zext i32 %add to i64 %base = inttoptr i64 %zextadd to ptr %res = load i8, ptr %base @@ -428,13 +526,16 @@ define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) { } ; Check that we correctly promote constant arguments. -; CHECK-LABEL: @oneArgPromotionNegativeCstZExt -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) { - %add = add nuw i8 %arg1, -1 +; CHECK-LABEL: define i8 @oneArgPromotionNegativeCstZExt( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED:%.*]] = zext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 255 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; + %add = add nuw i8 %arg1, -1 %zextadd = zext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd %res = load i8, ptr %arrayidx @@ -442,14 +543,17 @@ define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) { } ; Check that we are able to merge two zero extensions. -; CHECK-LABEL: @oneArgPromotionZExtZExt -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionZExtZExt( +; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %zext = zext i8 %arg1 to i32 - %add = add nuw i32 %zext, 1 + %add = add nuw i32 %zext, 1 %zextadd = zext i32 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd %res = load i8, ptr %arrayidx @@ -458,17 +562,20 @@ define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) { ; Check that we do not promote truncate when the dropped bits ; are of a different kind. -; CHECK-LABEL: @oneArgPromotionBlockTruncZExt -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32 -; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8 -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionBlockTruncZExt( +; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8 +; CHECK-NEXT: [[PROMOTED:%.*]] = zext i8 [[TRUNC]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = sext i1 %arg1 to i32 %trunc = trunc i32 %sextarg1 to i8 - %add = add nuw i8 %trunc, 1 + %add = add nuw i8 %trunc, 1 %zextadd = zext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd %res = load i8, ptr %arrayidx @@ -477,15 +584,18 @@ define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) { ; Check that we are able to promote truncate when we know all the bits ; that are dropped. -; CHECK-LABEL: @oneArgPromotionPassTruncZExt -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionPassTruncZExt( +; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[PROMOTED2:%.*]] = zext i1 [[ARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = zext i1 %arg1 to i32 %trunc = trunc i32 %sextarg1 to i8 - %add = add nuw i8 %trunc, 1 + %add = add nuw i8 %trunc, 1 %zextadd = zext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd %res = load i8, ptr %arrayidx @@ -493,15 +603,18 @@ define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) { } ; Check that we do not promote sext with zext. -; CHECK-LABEL: @oneArgPromotionBlockSExtZExt -; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8 -; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64 -; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 -; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]] -; CHECK: ret define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, ptr %base) { +; CHECK-LABEL: define i8 @oneArgPromotionBlockSExtZExt( +; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) { +; CHECK-NEXT: [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i8 +; CHECK-NEXT: [[PROMOTED:%.*]] = zext i8 [[SEXTARG1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret i8 [[RES]] +; %sextarg1 = sext i1 %arg1 to i8 - %add = add nuw i8 %sextarg1, 1 + %add = add nuw i8 %sextarg1, 1 %zextadd = zext i8 %add to i64 %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd %res = load i8, ptr %arrayidx diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir index aceb344d8b76..13f3f3ad4187 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir +++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir @@ -6,7 +6,7 @@ # "Replacement PHI node is already replaced." --- | - define void @f1() { + define void @f1(i1 %arg) { entry: %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2 br label %for.cond @@ -30,10 +30,10 @@ %5 = phi ptr [ %4, %for.body ], [ %5, %if.then5 ], [ undef, %for.cond2 ] %6 = phi ptr [ %3, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ] %7 = phi ptr [ %2, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ] - br i1 undef, label %for.cond2, label %if.then5 + br i1 %arg, label %for.cond2, label %if.then5 if.then5: - br i1 undef, label %cleanup, label %for.cond2 + br i1 %arg, label %cleanup, label %for.cond2 cleanup: br i1 true, label %for.cond, label %for.body diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir index 6159aa8a42e2..e93e04bfd443 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir +++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir @@ -7,7 +7,7 @@ --- | - define void @f1() { + define void @f1(i1 %arg) { entry: %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2 br label %for.cond @@ -24,7 +24,7 @@ %2 = phi ptr [ %1, %for.cond ], [ %12, %cleanup ] %3 = phi ptr [ %0, %for.cond ], [ %11, %cleanup ] %4 = phi ptr [ %0, %for.cond ], [ %10, %cleanup ] - br i1 undef, label %for.cond2.preheader, label %if.then + br i1 %arg, label %for.cond2.preheader, label %if.then for.cond2.preheader: br label %for.cond2 @@ -37,7 +37,7 @@ %5 = phi ptr [ %8, %for.inc ], [ %4, %for.cond2.preheader ] %6 = phi ptr [ %9, %for.inc ], [ %3, %for.cond2.preheader ] %7 = phi ptr [ %9, %for.inc ], [ %2, %for.cond2.preheader ] - br i1 undef, label %for.inc, label %if.then5 + br i1 %arg, label %for.inc, label %if.then5 if.then5: br i1 true, label %cleanup.loopexit, label %if.end diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll index 230afd146193..7237b02ca6b6 100644 --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -72,7 +72,7 @@ alloca_0: br label %loop.4942 loop.4942: ; preds = %loop.4942, %alloca_0 - br i1 undef, label %loop.4942, label %ifmerge.1298 + br i1 poison, label %loop.4942, label %ifmerge.1298 ifmerge.1298: ; preds = %loop.4942 %gepload4638 = load float, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 28324), align 4 diff --git a/llvm/test/CodeGen/X86/crash.ll b/llvm/test/CodeGen/X86/crash.ll index 16e3bb6e50ae..2f49a60a26f4 100644 --- a/llvm/test/CodeGen/X86/crash.ll +++ b/llvm/test/CodeGen/X86/crash.ll @@ -115,9 +115,9 @@ do.body92: ; preds = %if.then66 ; Crash during XOR optimization. ; <rdar://problem/7869290> -define void @test7() nounwind ssp { +define void @test7(i1 %arg) nounwind ssp { entry: - br i1 undef, label %bb14, label %bb67 + br i1 %arg, label %bb14, label %bb67 bb14: %tmp0 = trunc i16 undef to i1 @@ -157,14 +157,14 @@ entry: ; shift of and. %struct.S0 = type { i8, [2 x i8], i8 } -define void @func_59(i32 %p_63) noreturn nounwind { +define void @func_59(i32 %p_63, i1 %arg) noreturn nounwind { entry: br label %for.body for.body: ; preds = %for.inc44, %entry %p_63.addr.1 = phi i32 [ %p_63, %entry ], [ 0, %for.inc44 ] %l_74.0 = phi i32 [ 0, %entry ], [ %add46, %for.inc44 ] - br i1 undef, label %for.inc44, label %bb.nph81 + br i1 %arg, label %for.inc44, label %bb.nph81 bb.nph81: ; preds = %for.body %tmp98 = add i32 %p_63.addr.1, 0 @@ -237,7 +237,7 @@ declare i64 @llvm.objectsize.i64.p0(ptr, i1) nounwind readnone %t20 = type { i32, i32 } %t21 = type { ptr } -define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE() ssp align 2 { +define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE(i1 %arg) ssp align 2 { bb: %tmp = load ptr, ptr undef, align 4 %tmp3 = getelementptr inbounds %t9, ptr %tmp, i32 0, i32 0, i32 0, i32 0, i32 1 @@ -246,7 +246,7 @@ bb: bb4: ; preds = %bb37, %bb %tmp5 = phi i96 [ undef, %bb ], [ %tmp38, %bb37 ] %tmp6 = phi i96 [ undef, %bb ], [ %tmp39, %bb37 ] - br i1 undef, label %bb34, label %bb7 + br i1 %arg, label %bb34, label %bb7 bb7: ; preds = %bb4 %tmp8 = load i32, ptr undef, align 4 @@ -292,7 +292,7 @@ bb33: ; preds = %bb29 unreachable bb34: ; preds = %bb4 - br i1 undef, label %bb36, label %bb35 + br i1 %arg, label %bb36, label %bb35 bb35: ; preds = %bb34 store ptr null, ptr %tmp3, align 4 @@ -319,7 +319,7 @@ declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind ; PR10463 ; Spilling a virtual register with <undef> uses. -define void @autogen_239_1000() { +define void @autogen_239_1000(i1 %arg) { BB: %Shuff = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 undef> br label %CF @@ -327,14 +327,14 @@ BB: CF: %B16 = frem <8 x double> zeroinitializer, %Shuff %E19 = extractelement <8 x double> %Shuff, i32 5 - br i1 undef, label %CF, label %CF75 + br i1 %arg, label %CF, label %CF75 CF75: - br i1 undef, label %CF75, label %CF76 + br i1 %arg, label %CF75, label %CF76 CF76: store double %E19, ptr undef - br i1 undef, label %CF76, label %CF77 + br i1 %arg, label %CF76, label %CF77 CF77: %B55 = fmul <8 x double> %B16, undef @@ -396,24 +396,24 @@ if.end: ; InstrEmitter::EmitSubregNode() may steal virtual registers from already ; emitted blocks when isCoalescableExtInstr points out the opportunity. ; Make sure kill flags are cleared on the newly global virtual register. -define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream) nounwind uwtable ssp { +define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream, i1 %arg) nounwind uwtable ssp { entry: - br i1 undef, label %return, label %while.body.preheader + br i1 %arg, label %return, label %while.body.preheader while.body.preheader: ; preds = %entry - br i1 undef, label %if.then3, label %if.end7 + br i1 %arg, label %if.then3, label %if.end7 if.then3: ; preds = %while.body.preheader %0 = load i32, ptr undef, align 4 - br i1 undef, label %land.lhs.true.i255, label %if.end7 + br i1 %arg, label %land.lhs.true.i255, label %if.end7 land.lhs.true.i255: ; preds = %if.then3 - br i1 undef, label %if.then.i256, label %if.end7 + br i1 %arg, label %if.then.i256, label %if.end7 if.then.i256: ; preds = %land.lhs.true.i255 %sub.i = sub i32 0, %0 %conv = sext i32 %sub.i to i64 - br i1 undef, label %if.end7, label %while.end + br i1 %arg, label %if.end7, label %while.end if.end7: ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader unreachable @@ -486,12 +486,12 @@ declare void @fn3(...) ; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be ; erased along with its value number. ; -define void @rdar12474033() nounwind ssp { +define void @rdar12474033(i1 %arg, i32 %arg2, i32 %arg3, i32 %arg4) nounwind ssp { bb: - br i1 undef, label %bb21, label %bb1 + br i1 %arg, label %bb21, label %bb1 bb1: ; preds = %bb - switch i32 undef, label %bb10 [ + switch i32 %arg2, label %bb10 [ i32 4, label %bb2 i32 1, label %bb9 i32 5, label %bb3 @@ -503,7 +503,7 @@ bb2: ; preds = %bb1 unreachable bb3: ; preds = %bb1, %bb1 - br i1 undef, label %bb4, label %bb5 + br i1 %arg, label %bb4, label %bb5 bb4: ; preds = %bb3 unreachable @@ -521,7 +521,7 @@ bb9: ; preds = %bb1, %bb1 bb10: ; preds = %bb5, %bb1 %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ] %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ] - switch i32 undef, label %bb21 [ + switch i32 %arg3, label %bb21 [ i32 2, label %bb18 i32 3, label %bb13 i32 5, label %bb16 @@ -530,7 +530,7 @@ bb10: ; preds = %bb5, %bb1 ] bb13: ; preds = %bb10 - br i1 undef, label %bb15, label %bb14 + br i1 %arg, label %bb15, label %bb14 bb14: ; preds = %bb13 br label %bb21 @@ -554,7 +554,7 @@ bb21: ; preds = %bb18, %bb14, %bb10, %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ] store <4 x float> %tmp23, ptr undef, align 16 store <4 x float> %tmp22, ptr undef, align 16 - switch i32 undef, label %bb29 [ + switch i32 %arg4, label %bb29 [ i32 5, label %bb27 i32 1, label %bb24 i32 2, label %bb25 diff --git a/llvm/test/CodeGen/X86/domain-reassignment-test.ll b/llvm/test/CodeGen/X86/domain-reassignment-test.ll index af7aca67c8fa..77c1ef256cf0 100644 --- a/llvm/test/CodeGen/X86/domain-reassignment-test.ll +++ b/llvm/test/CodeGen/X86/domain-reassignment-test.ll @@ -3,7 +3,7 @@ ; Check that the X86 domain reassignment pass doesn't introduce an illegal ; test instruction. See PR37396 -define void @japi1_foo2_34617() { +define void @japi1_foo2_34617(i1 %arg) { pass2: br label %if5 @@ -27,7 +27,7 @@ if5: %tmp120 = and i1 %tmp118, %tmp119 %tmp121 = zext i1 %tmp120 to i8 %tmp122 = and i8 %b.055, %tmp121 - br i1 undef, label %L174, label %if5 + br i1 %arg, label %L174, label %if5 L188: unreachable diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll index 8d8d4fa699aa..4a5cddb30e03 100644 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll +++ b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll @@ -5,9 +5,9 @@ ; The machine verifier will catch and complain about this case. ; CHECK-LABEL: baz ; CHECK: retq -define void @baz() { +define void @baz(i1 %arg) { entry: - br i1 undef, label %exit, label %exit + br i1 %arg, label %exit, label %exit exit: ret void diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index c6da0c5ca479..1dcce5336895 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86 declare float @llvm.maximum.f32(float, float) @@ -73,6 +74,11 @@ define float @test_fmaximum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -110,6 +116,11 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no- ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_scalarize: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_scalarize: ; X86: # %bb.0: ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 @@ -129,6 +140,11 @@ define float @test_fmaximum_nan0(float %x, float %y) { ; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_nan0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_nan0: ; X86: # %bb.0: ; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} @@ -148,6 +164,11 @@ define float @test_fmaximum_nan1(float %x, float %y) { ; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_nan1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_nan1: ; X86: # %bb.0: ; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} @@ -215,6 +236,13 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind { ; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_nnan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: vminmaxss $1, %xmm0, %xmm2 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_nnan: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -272,6 +300,12 @@ define double @test_fmaximum_zero0(double %x, double %y) nounwind { ; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_zero0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX10_2-NEXT: vminmaxsd $1, %xmm0, %xmm1 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_zero0: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp @@ -323,6 +357,12 @@ define double @test_fmaximum_zero1(double %x, double %y) nounwind { ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_zero1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxsd $1, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_zero1: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp @@ -354,6 +394,11 @@ define double @test_fmaximum_zero2(double %x, double %y) { ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_zero2: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_zero2: ; X86: # %bb.0: ; X86-NEXT: fldz @@ -390,6 +435,11 @@ define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="t ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_nsz: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_nsz: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -474,6 +524,12 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; AVX512DQ-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_combine_cmps: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_combine_cmps: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -562,6 +618,11 @@ define float @test_fminimum(float %x, float %y) nounwind { ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -599,6 +660,11 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) " ; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_scalarize: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_scalarize: ; X86: # %bb.0: ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 @@ -618,6 +684,11 @@ define float @test_fminimum_nan0(float %x, float %y) { ; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_nan0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_nan0: ; X86: # %bb.0: ; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} @@ -637,6 +708,11 @@ define float @test_fminimum_nan1(float %x, float %y) { ; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_nan1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_nan1: ; X86: # %bb.0: ; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} @@ -695,6 +771,11 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" ; AVX512DQ-NEXT: vminsd %xmm2, %xmm1, %xmm0 ; AVX512DQ-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_nnan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $0, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_nnan: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp @@ -749,6 +830,11 @@ define double @test_fminimum_zero0(double %x, double %y) nounwind { ; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_zero0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_zero0: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp @@ -796,6 +882,11 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind { ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_zero1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_zero1: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp @@ -826,6 +917,11 @@ define double @test_fminimum_zero2(double %x, double %y) { ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_zero2: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_zero2: ; X86: # %bb.0: ; X86-NEXT: fldz @@ -863,6 +959,11 @@ define float @test_fminimum_nsz(float %x, float %y) nounwind { ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_nsz: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_nsz: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -948,6 +1049,12 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind { ; AVX512DQ-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_combine_cmps: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_combine_cmps: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -1009,6 +1116,11 @@ define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) { ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector: ; X86: # %bb.0: ; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 @@ -1032,6 +1144,11 @@ define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nan ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_vector: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_vector: ; X86: # %bb.0: ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 @@ -1054,6 +1171,12 @@ define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) { ; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_zero: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -1077,6 +1200,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) { ; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_vector_signed_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_vector_signed_zero: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] @@ -1102,6 +1230,13 @@ define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) { ; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_partially_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_partially_zero: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -1149,6 +1284,13 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) { ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_different_zeros: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_different_zeros: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -1177,6 +1319,11 @@ define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) { ; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_vector_non_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_vector_non_zero: ; X86: # %bb.0: ; X86-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] @@ -1206,6 +1353,13 @@ define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) { ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_nan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_nan: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -1232,6 +1386,12 @@ define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) { ; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_zero_first: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_zero_first: ; X86: # %bb.0: ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -1260,6 +1420,11 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) { ; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fminimum_vector_signed_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fminimum_vector_signed_zero: ; X86: # %bb.0: ; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 @@ -1284,6 +1449,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) { ; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_vector_signed_zero_first: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] @@ -1314,6 +1484,12 @@ define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) { ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_vector_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_vector_zero: ; X86: # %bb.0: ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -1369,6 +1545,12 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) { ; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_v4f32_splat: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_v4f32_splat: ; X86: # %bb.0: ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 @@ -1803,6 +1985,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_v4f16: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxph $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_v4f16: ; X86: # %bb.0: ; X86-NEXT: subl $164, %esp @@ -2330,6 +2517,11 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq ; +; AVX10_2-LABEL: test_fmaximum_v4bf16: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxnepbf16 $1, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; ; X86-LABEL: test_fmaximum_v4bf16: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll new file mode 100644 index 000000000000..2e9e8e62b356 --- /dev/null +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -0,0 +1,2765 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86 + +declare float @llvm.maximumnum.f32(float, float) +declare double @llvm.maximumnum.f64(double, double) +declare float @llvm.minimumnum.f32(float, float) +declare double @llvm.minimumnum.f64(double, double) +declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>) +declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>) +declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>) + +; +; fmaximumnum +; + +define float @test_fmaximumnum(float %x, float %y) nounwind { +; SSE2-LABEL: test_fmaximumnum: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: js .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: cmpordss %xmm3, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: js .LBB0_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB0_4: +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB0_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: jmp .LBB0_3 +; AVX1-NEXT: .LBB0_1: +; AVX1-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: .LBB0_3: +; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm2, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB0_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: jmp .LBB0_3 +; X86-NEXT: .LBB0_1: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: .LBB0_3: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = tail call float @llvm.maximumnum.f32(float %x, float %y) + ret float %1 +} + +define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fmaximumnum_scalarize: +; SSE2: # %bb.0: +; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_scalarize: +; AVX: # %bb.0: +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_scalarize: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_scalarize: +; X86: # %bb.0: +; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %r +} + +define float @test_fmaximumnum_nan0(float %x, float %y) { +; SSE2-LABEL: test_fmaximumnum_nan0: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_nan0: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_nan0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovaps %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_nan0: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: retl + %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y) + ret float %1 +} + +define float @test_fmaximumnum_nan1(float %x, float %y) { +; SSE2-LABEL: test_fmaximumnum_nan1: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_nan1: +; AVX: # %bb.0: +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_nan1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_nan1: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: retl + %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000) + ret float %1 +} + +define float @test_fmaximumnum_nnan(float %x, float %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_nnan: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: addss %xmm1, %xmm2 +; SSE2-NEXT: subss %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: js .LBB4_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: maxss %xmm2, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB4_1: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_nnan: +; AVX1: # %bb.0: +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB4_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB4_1: +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fmaximumnum_nnan: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: sets %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm2, %xmm1 +; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fmaximumnum_nnan: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0) +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_nnan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: vminmaxss $17, %xmm0, %xmm2 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_nnan: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1 +; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB4_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm1, %xmm2 +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: +; X86-NEXT: vmovaps %xmm0, %xmm2 +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: .LBB4_3: +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = fadd nnan float %x, %y + %2 = fsub nnan float %x, %y + %3 = tail call float @llvm.maximumnum.f32(float %1, float %2) + ret float %3 +} + +define double @test_fmaximumnum_zero0(double %x, double %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_zero0: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpordsd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm1, %xmm2 +; SSE2-NEXT: xorpd %xmm3, %xmm3 +; SSE2-NEXT: maxsd %xmm3, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_zero0: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_zero0: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_zero0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX10_2-NEXT: vminmaxsd $17, %xmm0, %xmm1 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_zero0: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y) + ret double %1 +} + +define double @test_fmaximumnum_zero1(double %x, double %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_zero1: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: xorpd %xmm3, %xmm3 +; SSE2-NEXT: maxsd %xmm3, %xmm0 +; SSE2-NEXT: andnpd %xmm0, %xmm1 +; SSE2-NEXT: orpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_zero1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_zero1: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vcmpordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_zero1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxsd $17, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_zero1: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %1 = tail call double @llvm.maximumnum.f64(double %x, double 0.0) + ret double %1 +} + +define double @test_fmaximumnum_zero2(double %x, double %y) { +; SSE2-LABEL: test_fmaximumnum_zero2: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_zero2: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_zero2: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_zero2: +; X86: # %bb.0: +; X86-NEXT: fldz +; X86-NEXT: retl + %1 = tail call double @llvm.maximumnum.f64(double 0.0, double -0.0) + ret double %1 +} + +define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind { +; SSE2-LABEL: test_fmaximumnum_nsz: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm0, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_nsz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_nsz: +; AVX512: # %bb.0: +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_nsz: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_nsz: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 +; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = tail call float @llvm.maximumnum.f32(float %x, float %y) + ret float %1 +} + +define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_combine_cmps: +; SSE2: # %bb.0: +; SSE2-NEXT: divss %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: js .LBB9_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: .LBB9_2: +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: cmpordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: js .LBB9_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: .LBB9_4: +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_combine_cmps: +; AVX1: # %bb.0: +; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB9_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: jmp .LBB9_3 +; AVX1-NEXT: .LBB9_1: +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: .LBB9_3: +; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fmaximumnum_combine_cmps: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: sets %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm0, %xmm2 +; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0) +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_combine_cmps: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_combine_cmps: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB9_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm1, %xmm2 +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: +; X86-NEXT: vmovaps %xmm0, %xmm2 +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: .LBB9_3: +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = fdiv nnan float %y, %x + %2 = tail call float @llvm.maximumnum.f32(float %x, float %1) + ret float %2 +} + +; +; fminimumnum +; + +define float @test_fminimumnum(float %x, float %y) nounwind { +; SSE2-LABEL: test_fminimumnum: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: js .LBB10_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: .LBB10_2: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: cmpordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: js .LBB10_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: .LBB10_4: +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB10_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-NEXT: jmp .LBB10_3 +; AVX1-NEXT: .LBB10_1: +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa %xmm1, %xmm0 +; AVX1-NEXT: .LBB10_3: +; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimumnum: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovaps %xmm1, %xmm2 +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB10_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: jmp .LBB10_3 +; X86-NEXT: .LBB10_1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: .LBB10_3: +; X86-NEXT: vminss %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = tail call float @llvm.minimumnum.f32(float %x, float %y) + ret float %1 +} + +define <2 x double> @test_fminimumnum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fminimumnum_scalarize: +; SSE2: # %bb.0: +; SSE2-NEXT: minpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_scalarize: +; AVX: # %bb.0: +; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_scalarize: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_scalarize: +; X86: # %bb.0: +; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %r +} + +define float @test_fminimumnum_nan0(float %x, float %y) { +; SSE2-LABEL: test_fminimumnum_nan0: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_nan0: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_nan0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovaps %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_nan0: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: retl + %1 = tail call float @llvm.minimumnum.f32(float 0x7fff000000000000, float %y) + ret float %1 +} + +define float @test_fminimumnum_nan1(float %x, float %y) { +; SSE2-LABEL: test_fminimumnum_nan1: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_nan1: +; AVX: # %bb.0: +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_nan1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_nan1: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: retl + %1 = tail call float @llvm.minimumnum.f32(float %x, float 0x7fff000000000000) + ret float %1 +} + +define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind { +; SSE2-LABEL: test_fminimumnum_nnan: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB14_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: minsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB14_1: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: minsd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum_nnan: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB14_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB14_1: +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fminimumnum_nnan: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: testq %rax, %rax +; AVX512F-NEXT: sets %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovapd %xmm1, %xmm2 +; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fminimumnum_nnan: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1) +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovapd %xmm0, %xmm2 +; AVX512DQ-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512DQ-NEXT: vminsd %xmm2, %xmm1, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_nnan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $16, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_nnan: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vextractps $1, %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB14_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovapd %xmm1, %xmm2 +; X86-NEXT: jmp .LBB14_3 +; X86-NEXT: .LBB14_1: +; X86-NEXT: vmovapd %xmm0, %xmm2 +; X86-NEXT: vmovapd %xmm1, %xmm0 +; X86-NEXT: .LBB14_3: +; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %1 = tail call double @llvm.minimumnum.f64(double %x, double %y) + ret double %1 +} + +define double @test_fminimumnum_zero0(double %x, double %y) nounwind { +; SSE2-LABEL: test_fminimumnum_zero0: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpordsd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm1, %xmm2 +; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum_zero0: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimumnum_zero0: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_zero0: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_zero0: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %1 = tail call double @llvm.minimumnum.f64(double -0.0, double %y) + ret double %1 +} + +define double @test_fminimumnum_zero1(double %x, double %y) nounwind { +; SSE2-LABEL: test_fminimumnum_zero1: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andnpd %xmm0, %xmm1 +; SSE2-NEXT: orpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum_zero1: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimumnum_zero1: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_zero1: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_zero1: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %1 = tail call double @llvm.minimumnum.f64(double %x, double -0.0) + ret double %1 +} + +define double @test_fminimumnum_zero2(double %x, double %y) { +; SSE2-LABEL: test_fminimumnum_zero2: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_zero2: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_zero2: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_zero2: +; X86: # %bb.0: +; X86-NEXT: fldz +; X86-NEXT: fchs +; X86-NEXT: retl + %1 = tail call double @llvm.minimumnum.f64(double -0.0, double 0.0) + ret double %1 +} + +define float @test_fminimumnum_nsz(float %x, float %y) nounwind { +; SSE2-LABEL: test_fminimumnum_nsz: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm0, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum_nsz: +; AVX1: # %bb.0: +; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimumnum_nsz: +; AVX512: # %bb.0: +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_nsz: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_nsz: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1 +; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = tail call nsz float @llvm.minimumnum.f32(float %x, float %y) + ret float %1 +} + +define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { +; SSE2-LABEL: test_fminimumnum_combine_cmps: +; SSE2: # %bb.0: +; SSE2-NEXT: divss %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: js .LBB19_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: .LBB19_2: +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: cmpordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: js .LBB19_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: .LBB19_4: +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimumnum_combine_cmps: +; AVX1: # %bb.0: +; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB19_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: jmp .LBB19_3 +; AVX1-NEXT: .LBB19_1: +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm0 +; AVX1-NEXT: .LBB19_3: +; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fminimumnum_combine_cmps: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: sets %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm1, %xmm2 +; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1 +; AVX512F-NEXT: vcmpordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512F-NEXT: vmovaps %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fminimumnum_combine_cmps: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512DQ-NEXT: vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0) +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_combine_cmps: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_combine_cmps: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB19_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm2, %xmm1 +; X86-NEXT: jmp .LBB19_3 +; X86-NEXT: .LBB19_1: +; X86-NEXT: vmovaps %xmm0, %xmm1 +; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: .LBB19_3: +; X86-NEXT: vminss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %1 = fdiv nnan float %y, %x + %2 = tail call float @llvm.minimumnum.f32(float %x, float %1) + ret float %2 +} + +define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) { +; SSE2-LABEL: test_fminimumnum_vector: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: cmpordpd %xmm3, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector: +; AVX: # %bb.0: +; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector: +; X86: # %bb.0: +; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 +; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %r +} + +define <4 x float> @test_fmaximumnum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fmaximumnum_vector: +; SSE2: # %bb.0: +; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_vector: +; AVX: # %bb.0: +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_vector: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_vector: +; X86: # %bb.0: +; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %r +} + +define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_zero: +; AVX: # %bb.0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_zero: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>) + ret <2 x double> %r +} + +define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { +; SSE2-LABEL: test_fmaximumnum_vector_signed_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE2-NEXT: maxps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_vector_signed_zero: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_vector_signed_zero: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>) + ret <4 x float> %r +} + +define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_partially_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_partially_zero: +; AVX: # %bb.0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_partially_zero: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>) + ret <2 x double> %r +} + +define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_different_zeros: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: orps %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: minpd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: cmpordpd %xmm3, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_different_zeros: +; AVX: # %bb.0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_different_zeros: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 +; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>) + ret <2 x double> %r +} + +define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) { +; SSE2-LABEL: test_fmaximumnum_vector_non_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] +; SSE2-NEXT: maxps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_vector_non_zero: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_vector_non_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_vector_non_zero: +; X86: # %bb.0: +; X86-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>) + ret <4 x float> %r +} + +define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_nan: +; SSE2: # %bb.0: +; SSE2-NEXT: xorpd %xmm2, %xmm2 +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_nan: +; AVX: # %bb.0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0] +; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_nan: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_nan: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vcmpordpd %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>) + ret <2 x double> %r +} + +define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_zero_first: +; SSE2: # %bb.0: +; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: minpd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_zero_first: +; AVX: # %bb.0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_zero_first: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_zero_first: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x) + ret <2 x double> %r +} + +define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) { +; SSE2-LABEL: test_fminimumnum_vector_signed_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpordpd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_vector_signed_zero: +; AVX: # %bb.0: +; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxpd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_vector_signed_zero: +; X86: # %bb.0: +; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>) + ret <2 x double> %r +} + +define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { +; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE2-NEXT: maxps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_vector_signed_zero_first: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x) + ret <4 x float> %r +} + +define <4 x float> @test_fmaximumnum_vector_zero(<4 x float> %x) { +; SSE2-LABEL: test_fmaximumnum_vector_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxps %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpordps %xmm0, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_vector_zero: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_vector_zero: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_vector_zero: +; X86: # %bb.0: +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>) + ret <4 x float> %r +} + +; PR77805: Check that signed zeroes are handled correctly in this case (FIXME) +define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { +; SSE2-LABEL: test_fmaximumnum_v4f32_splat: +; SSE2: # %bb.0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm4 +; SSE2-NEXT: orps %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: maxps %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: cmpordps %xmm0, %xmm2 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_v4f32_splat: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_v4f32_splat: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_v4f32_splat: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl + %splatinsert = insertelement <4 x float> poison, float %y, i64 0 + %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %vec) readnone + ret <4 x float> %r +} + +define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_v4f16: +; SSE2: # %bb.0: +; SSE2-NEXT: subq $104, %rsp +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: js .LBB33_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: .LBB33_2: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: js .LBB33_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: .LBB33_4: +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: js .LBB33_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: .LBB33_6: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: js .LBB33_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: .LBB33_8: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: js .LBB33_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: .LBB33_10: +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: cmpordss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: js .LBB33_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: .LBB33_12: +; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd (%rsp), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: js .LBB33_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: .LBB33_14: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: js .LBB33_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: .LBB33_16: +; SSE2-NEXT: maxss %xmm4, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: addq $104, %rsp +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_v4f16: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $120, %rsp +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB33_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: jmp .LBB33_3 +; AVX1-NEXT: .LBB33_1: +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: .LBB33_3: +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB33_4 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: jmp .LBB33_6 +; AVX1-NEXT: .LBB33_4: +; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: .LBB33_6: +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB33_7 +; AVX1-NEXT: # %bb.8: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: jmp .LBB33_9 +; AVX1-NEXT: .LBB33_7: +; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: .LBB33_9: +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: js .LBB33_10 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: jmp .LBB33_12 +; AVX1-NEXT: .LBB33_10: +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: .LBB33_12: +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX1-NEXT: addq $120, %rsp +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_v4f16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqa %xmm1, %xmm4 +; AVX512-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm2 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm9 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vxorps %xmm10, %xmm10, %xmm10 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3 +; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm3 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 +; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm5 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm4[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: vucomiss %xmm5, %xmm5 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm15 +; AVX512-NEXT: vcvtph2ps %xmm15, %xmm5 +; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm5 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmulss %xmm3, %xmm9, %xmm3 +; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm11 +; AVX512-NEXT: vcvtph2ps %xmm11, %xmm3 +; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm3 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm7 +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm3 +; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm12 +; AVX512-NEXT: vcvtph2ps %xmm12, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm3 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14 +; AVX512-NEXT: vmovd %xmm14, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13 +; AVX512-NEXT: vmovd %xmm13, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm8 +; AVX512-NEXT: vcvtph2ps %xmm8, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7] +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vucomiss %xmm4, %xmm4 +; AVX512-NEXT: setp %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovss %xmm1, %xmm4, %xmm4 {%k2} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm6, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 +; AVX512-NEXT: vucomiss %xmm0, %xmm6 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm9 +; AVX512-NEXT: vmovd %xmm9, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm10 +; AVX512-NEXT: vmovd %xmm10, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX512-NEXT: # xmm6 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovd %xmm15, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512-NEXT: vmovd %xmm11, %eax +; AVX512-NEXT: vmovd %xmm7, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: vmovd %xmm4, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm3 +; AVX512-NEXT: vpblendvb %xmm3, %xmm2, %xmm6, %xmm2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm3, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm4, %ecx +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 +; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 +; AVX512-NEXT: vmovd %xmm12, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vmovd %xmm8, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX512-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm10, %xmm1 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovel %ecx, %edx +; AVX512-NEXT: vcvtph2ps %xmm9, %xmm1 +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: cmovel %ecx, %esi +; AVX512-NEXT: vcvtph2ps %xmm13, %xmm1 +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: cmovel %ecx, %edi +; AVX512-NEXT: vcvtph2ps %xmm14, %xmm1 +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %r8d +; AVX512-NEXT: cmovel %ecx, %r8d +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %r9d +; AVX512-NEXT: cmovel %ecx, %r9d +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %r10d +; AVX512-NEXT: cmovel %ecx, %r10d +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: cmovel %ecx, %r11d +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: vucomiss %xmm2, %xmm1 +; AVX512-NEXT: vmovd %esi, %xmm1 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $6, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: cmovel %ecx, %eax +; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpblendvb %xmm1, %xmm0, %xmm6, %xmm0 +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_v4f16: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxph $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_v4f16: +; X86: # %bb.0: +; X86-NEXT: subl $164, %esp +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpsrlq $48, %xmm1, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpsrld $16, %xmm2, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpsrld $16, %xmm1, %xmm0 +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpextrw $0, %xmm1, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm2, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB33_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: jmp .LBB33_3 +; X86-NEXT: .LBB33_1: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: .LBB33_3: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB33_4 +; X86-NEXT: # %bb.5: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: jmp .LBB33_6 +; X86-NEXT: .LBB33_4: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: .LBB33_6: +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrw $0, %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB33_7 +; X86-NEXT: # %bb.8: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: jmp .LBB33_9 +; X86-NEXT: .LBB33_7: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: .LBB33_9: +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: testl %eax, %eax +; X86-NEXT: js .LBB33_10 +; X86-NEXT: # %bb.11: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: jmp .LBB33_12 +; X86-NEXT: .LBB33_10: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: .LBB33_12: +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovd %xmm0, (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; X86-NEXT: addl $164, %esp +; X86-NEXT: retl + %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y) + ret <4 x half> %r +} + +define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $56, %rsp +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pextrw $0, %xmm1, %r14d +; SSE2-NEXT: pextrw $0, %xmm0, %r15d +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %ecx +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: testl %ecx, %ecx +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: js .LBB34_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_2: +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE2-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1,1,1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: cmpordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: andps %xmm1, %xmm6 +; SSE2-NEXT: js .LBB34_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: .LBB34_4: +; SSE2-NEXT: pextrw $0, %xmm4, %ebp +; SSE2-NEXT: pextrw $0, %xmm5, %ebx +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm6, %xmm0 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: shll $16, %r14d +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: testl %r15d, %r15d +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: js .LBB34_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: .LBB34_6: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm5 +; SSE2-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: psrlq $48, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: cmpordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: andps %xmm1, %xmm4 +; SSE2-NEXT: js .LBB34_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: .LBB34_8: +; SSE2-NEXT: pextrw $0, %xmm5, %r15d +; SSE2-NEXT: pextrw $0, %xmm6, %r14d +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: testl %ebx, %ebx +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: js .LBB34_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: .LBB34_10: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm4 +; SSE2-NEXT: js .LBB34_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: .LBB34_12: +; SSE2-NEXT: maxss %xmm3, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: shll $16, %r14d +; SSE2-NEXT: movd %r14d, %xmm1 +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: testl %r14d, %r14d +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: js .LBB34_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: .LBB34_14: +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: cmpordss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm4 +; SSE2-NEXT: js .LBB34_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: .LBB34_16: +; SSE2-NEXT: maxss %xmm3, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximumnum_v4bf16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm4, %ebx +; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm4, %ebp +; AVX1-NEXT: vpextrw $0, %xmm0, %r12d +; AVX1-NEXT: vpextrw $0, %xmm1, %r13d +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpextrw $0, %xmm0, %eax +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-NEXT: vpextrw $0, %xmm0, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: js .LBB34_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-NEXT: jmp .LBB34_3 +; AVX1-NEXT: .LBB34_1: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-NEXT: .LBB34_3: +; AVX1-NEXT: vpextrw $0, %xmm2, %r14d +; AVX1-NEXT: vpextrw $0, %xmm3, %r15d +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfbf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: shll $16, %r13d +; AVX1-NEXT: vmovd %r13d, %xmm0 +; AVX1-NEXT: shll $16, %r12d +; AVX1-NEXT: vmovd %r12d, %xmm2 +; AVX1-NEXT: js .LBB34_4 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vmovdqa %xmm2, %xmm1 +; AVX1-NEXT: jmp .LBB34_6 +; AVX1-NEXT: .LBB34_4: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: .LBB34_6: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfbf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: shll $16, %ebp +; AVX1-NEXT: vmovd %ebp, %xmm0 +; AVX1-NEXT: shll $16, %ebx +; AVX1-NEXT: vmovd %ebx, %xmm2 +; AVX1-NEXT: js .LBB34_7 +; AVX1-NEXT: # %bb.8: +; AVX1-NEXT: vmovdqa %xmm2, %xmm1 +; AVX1-NEXT: jmp .LBB34_9 +; AVX1-NEXT: .LBB34_7: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: .LBB34_9: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfbf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: shll $16, %r15d +; AVX1-NEXT: vmovd %r15d, %xmm0 +; AVX1-NEXT: shll $16, %r14d +; AVX1-NEXT: vmovd %r14d, %xmm2 +; AVX1-NEXT: js .LBB34_10 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: vmovdqa %xmm2, %xmm1 +; AVX1-NEXT: jmp .LBB34_12 +; AVX1-NEXT: .LBB34_10: +; AVX1-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: .LBB34_12: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfbf2@PLT +; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximumnum_v4bf16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vmovq %xmm1, %r13 +; AVX512-NEXT: movq %r13, %rbx +; AVX512-NEXT: shrq $32, %rbx +; AVX512-NEXT: vmovq %xmm0, %rbp +; AVX512-NEXT: movq %rbp, %r14 +; AVX512-NEXT: shrq $32, %r14 +; AVX512-NEXT: movq %r13, %r15 +; AVX512-NEXT: shrq $48, %r15 +; AVX512-NEXT: movq %rbp, %r12 +; AVX512-NEXT: shrq $48, %r12 +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; AVX512-NEXT: sets %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: movl %r13d, %ecx +; AVX512-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: callq __truncsfbf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: shll $16, %r13d +; AVX512-NEXT: vmovd %r13d, %xmm1 +; AVX512-NEXT: vmovd %ebp, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: callq __truncsfbf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: shll $16, %r12d +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: shll $16, %r15d +; AVX512-NEXT: vmovd %r15d, %xmm1 +; AVX512-NEXT: vmovd %r12d, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: callq __truncsfbf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: shll $16, %r14d +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: vmovd %ebx, %xmm1 +; AVX512-NEXT: vmovd %r14d, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: callq __truncsfbf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: addq $8, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_v4bf16: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: vminmaxnepbf16 $17, %xmm1, %xmm0, %xmm0 +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_v4bf16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $68, %esp +; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 +; X86-NEXT: vpsrlq $48, %xmm1, %xmm3 +; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm4, %esi +; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm4, %ebx +; X86-NEXT: vpextrw $0, %xmm0, %eax +; X86-NEXT: vpextrw $0, %xmm1, %ecx +; X86-NEXT: vpsrld $16, %xmm0, %xmm0 +; X86-NEXT: vpextrw $0, %xmm0, %edx +; X86-NEXT: vpsrld $16, %xmm1, %xmm0 +; X86-NEXT: vpextrw $0, %xmm0, %edi +; X86-NEXT: shll $16, %edi +; X86-NEXT: vmovd %edi, %xmm0 +; X86-NEXT: shll $16, %edx +; X86-NEXT: vmovd %edx, %xmm4 +; X86-NEXT: js .LBB34_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm4, %xmm1 +; X86-NEXT: jmp .LBB34_3 +; X86-NEXT: .LBB34_1: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm4, %xmm0 +; X86-NEXT: .LBB34_3: +; X86-NEXT: vpextrw $0, %xmm2, %edi +; X86-NEXT: vpextrw $0, %xmm3, %ebp +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: shll $16, %ecx +; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm2 +; X86-NEXT: js .LBB34_4 +; X86-NEXT: # %bb.5: +; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: jmp .LBB34_6 +; X86-NEXT: .LBB34_4: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: .LBB34_6: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: shll $16, %ebx +; X86-NEXT: vmovd %ebx, %xmm0 +; X86-NEXT: shll $16, %esi +; X86-NEXT: vmovd %esi, %xmm2 +; X86-NEXT: js .LBB34_7 +; X86-NEXT: # %bb.8: +; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: jmp .LBB34_9 +; X86-NEXT: .LBB34_7: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: .LBB34_9: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: shll $16, %ebp +; X86-NEXT: vmovd %ebp, %xmm0 +; X86-NEXT: shll $16, %edi +; X86-NEXT: vmovd %edi, %xmm2 +; X86-NEXT: js .LBB34_10 +; X86-NEXT: # %bb.11: +; X86-NEXT: vmovdqa %xmm2, %xmm1 +; X86-NEXT: jmp .LBB34_12 +; X86-NEXT: .LBB34_10: +; X86-NEXT: vmovdqa %xmm0, %xmm1 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: .LBB34_12: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vmovd %xmm0, (%esp) +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; X86-NEXT: addl $68, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll index 95432380ced7..55d9ea90682d 100644 --- a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll +++ b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 -define void @autogen_SD13708(i32) { +define void @autogen_SD13708(i32, i1 %arg) { BB: %Shuff7 = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 undef, i32 2, i32 4, i32 undef> br label %CF @@ -8,11 +8,11 @@ BB: CF: %Tr = trunc <8 x i64> zeroinitializer to <8 x i32> %Shuff20 = shufflevector <8 x i32> %Shuff7, <8 x i32> %Tr, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 11> - br i1 undef, label %CF, label %CF247 + br i1 %arg, label %CF, label %CF247 CF247: %I171 = insertelement <8 x i32> %Shuff20, i32 %0, i32 0 - br i1 undef, label %CF, label %CF247 + br i1 %arg, label %CF, label %CF247 } define void @autogen_SD13800(ptr, ptr, ptr, i32, i64, i8) { diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll index d11b6666442b..b51609c313b0 100644 --- a/llvm/test/CodeGen/X86/hoist-spill.ll +++ b/llvm/test/CodeGen/X86/hoist-spill.ll @@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu" @d = external global ptr, align 8 ; Function Attrs: norecurse noreturn nounwind uwtable -define void @fn1(i32 %p1, i32 %p2, i64 %p3) { +define void @fn1(i32 %p1, i32 %p2, i64 %p3, i1 %arg) { entry: %tmp = load ptr, ptr @d, align 8 %tmp1 = load ptr, ptr @a, align 8 @@ -54,10 +54,10 @@ for.cond4.preheader: ; preds = %for.body, %for.cond br i1 %cmp528, label %for.inc14, label %for.body6.preheader for.body6.preheader: ; preds = %for.cond4.preheader - br i1 undef, label %for.body6, label %min.iters.checked + br i1 %arg, label %for.body6, label %min.iters.checked min.iters.checked: ; preds = %for.body6.preheader - br i1 undef, label %for.body6, label %vector.memcheck + br i1 %arg, label %for.body6, label %vector.memcheck vector.memcheck: ; preds = %min.iters.checked %bound1 = icmp ule ptr undef, %scevgep41 @@ -85,10 +85,10 @@ vector.body: ; preds = %vector.body, %vecto %tmp16 = getelementptr inbounds i32, ptr %tmp1, i64 %offset.idx.1 store <4 x i32> %wide.load.1, ptr %tmp16, align 4 %index.next.3 = add i64 %index, 32 - br i1 undef, label %middle.block, label %vector.body + br i1 %arg, label %middle.block, label %vector.body middle.block: ; preds = %vector.body, %vector.body.preheader.split - br i1 undef, label %for.inc14, label %for.body6 + br i1 %arg, label %for.inc14, label %for.body6 for.body.preheader: ; preds = %for.cond br label %for.body @@ -98,7 +98,7 @@ for.body: ; preds = %for.body, %for.body %add = add nsw i32 %k.127, 1 %tmp18 = load i32, ptr undef, align 4 store i32 %tmp18, ptr @b, align 4 - br i1 undef, label %for.body, label %for.cond4.preheader + br i1 %arg, label %for.body, label %for.cond4.preheader for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ] diff --git a/llvm/test/CodeGen/X86/implicit-null-checks.mir b/llvm/test/CodeGen/X86/implicit-null-checks.mir index 0077906b6018..c98019c09a1e 100644 --- a/llvm/test/CodeGen/X86/implicit-null-checks.mir +++ b/llvm/test/CodeGen/X86/implicit-null-checks.mir @@ -5,15 +5,15 @@ target triple = "x86_64-apple-macosx" ;; Positive test - define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -24,15 +24,15 @@ ;; Negative test. The regalloc is such that we cannot hoist the ;; instruction materializing 2200000 into $eax - define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr) { + define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 undef not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -43,15 +43,15 @@ ;; Negative test: IR is identical to ;; @imp_null_check_with_bitwise_op_0 but MIR differs. - define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -62,15 +62,15 @@ ;; Negative test: IR is identical to ;; @imp_null_check_with_bitwise_op_0 but MIR differs. - define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -80,15 +80,15 @@ } ;; Positive test - define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 diff --git a/llvm/test/CodeGen/X86/interval-update-remat.ll b/llvm/test/CodeGen/X86/interval-update-remat.ll index 44d3db3a2972..91fde2ba018b 100644 --- a/llvm/test/CodeGen/X86/interval-update-remat.ll +++ b/llvm/test/CodeGen/X86/interval-update-remat.ll @@ -17,13 +17,13 @@ target triple = "i386-unknown-linux-gnu" @f = external global i16, align 2 @.str = external unnamed_addr constant [12 x i8], align 1 -define void @fn1() { +define void @fn1(i1 %arg) { entry: %tmp = load i64, ptr @b, align 8 %or = or i64 0, 3299921317 %and = and i64 %or, %tmp %tmp1 = load i32, ptr @d, align 4 - br i1 undef, label %lor.rhs, label %lor.end + br i1 %arg, label %lor.rhs, label %lor.end lor.rhs: ; preds = %entry %tobool3 = icmp ne i8 undef, 0 @@ -32,7 +32,7 @@ lor.rhs: ; preds = %entry lor.end: ; preds = %lor.rhs, %entry %lor.ext = zext i1 undef to i32 %tmp2 = load i64, ptr @e, align 8 - br i1 undef, label %lor.rhs5, label %lor.end7 + br i1 %arg, label %lor.rhs5, label %lor.end7 lor.rhs5: ; preds = %lor.end br label %lor.end7 diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll index 9eaa65442a72..6dc0427b02f3 100644 --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -249,16 +249,16 @@ define void @func_o() nounwind uwtable { ; CHECK-NEXT: .LBB12_7: # %if.else.i97 entry: %0 = load i16, ptr undef, align 2 - br i1 undef, label %if.then.i, label %if.end.i + br i1 poison, label %if.then.i, label %if.end.i if.then.i: ; preds = %entry unreachable if.end.i: ; preds = %entry - br i1 undef, label %sw.bb, label %sw.default + br i1 poison, label %sw.bb, label %sw.default sw.bb: ; preds = %if.end.i - br i1 undef, label %if.then44, label %if.end29 + br i1 poison, label %if.then44, label %if.end29 if.end29: ; preds = %sw.bb %1 = urem i16 %0, 10 @@ -267,7 +267,7 @@ if.end29: ; preds = %sw.bb br i1 %cmp25, label %if.then44, label %sw.default sw.default: ; preds = %if.end29, %if.end.i - br i1 undef, label %if.then.i96, label %if.else.i97 + br i1 poison, label %if.then.i96, label %if.else.i97 if.then.i96: ; preds = %sw.default unreachable @@ -277,7 +277,7 @@ if.else.i97: ; preds = %sw.default if.then44: ; preds = %if.end29, %sw.bb %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ] - br i1 undef, label %if.then.i103, label %if.else.i104 + br i1 poison, label %if.then.i103, label %if.else.i104 if.then.i103: ; preds = %if.then44 unreachable @@ -420,4 +420,3 @@ if.end: } !1 = !{!"branch_weights", i32 2, i32 1} - diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll index a00433391f15..9cd755119e7a 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll @@ -7,7 +7,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.12.0" -define void @foo() { +define void @foo(i1 %arg) { entry: br label %for @@ -17,7 +17,7 @@ for: store i32 %next, ptr undef, align 4 %add = add i64 %0, 9223372036854775807 %inc = add nsw i32 %next, 1 - br i1 undef, label %exit, label %for + br i1 %arg, label %exit, label %for exit: store i64 %add, ptr undef diff --git a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll index 552999fdba65..cf434419bb97 100644 --- a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll +++ b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32-S128" target triple = "x86_64-unknown-linux-gnu" ; CHECK-LABEL: @hoge -define void @hoge() { +define void @hoge(i1 %arg) { bb: %tmp = sext i32 undef to i64 %tmp3 = sub nsw i64 0, %tmp @@ -21,7 +21,7 @@ bb7: ; preds = %bb7, %bb4 br i1 true, label %bb11, label %bb7 bb11: ; preds = %bb7 - br i1 undef, label %bb20, label %bb12 + br i1 %arg, label %bb20, label %bb12 bb12: ; preds = %bb11 br label %bb13 diff --git a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll index efa9331cfcc4..a35015d09a4f 100644 --- a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll +++ b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll @@ -30,7 +30,7 @@ bb24: ; preds = %bb21, %bb11 ; ScalarEvolution should be able to correctly expand the crazy addrec here. ; PR6914 -define void @int323() nounwind { +define void @int323(i1 %arg) nounwind { entry: br label %for.cond @@ -38,7 +38,7 @@ for.cond: ; preds = %lbl_264, %for.inc, %g_263.tmp.1 = phi i8 [ undef, %entry ], [ %g_263.tmp.1, %for.cond ] %p_95.addr.0 = phi i8 [ 0, %entry ], [ %add, %for.cond ] %add = add i8 %p_95.addr.0, 1 ; <i8> [#uses=1] - br i1 undef, label %for.cond, label %lbl_264 + br i1 %arg, label %for.cond, label %lbl_264 lbl_264: ; preds = %if.end, %lbl_264.preheader %g_263.tmp.0 = phi i8 [ %g_263.tmp.1, %for.cond ] ; <i8> [#uses=1] @@ -56,13 +56,13 @@ lbl_264: ; preds = %if.end, %lbl_264.pr %struct.Bu = type { i32, i32, i32 } -define void @_Z3fooP2Bui(ptr nocapture %bu) { +define void @_Z3fooP2Bui(ptr nocapture %bu, i1 %arg) { entry: br label %for.body for.body: ; preds = %for.inc131, %entry %indvar = phi i64 [ %indvar.next, %for.inc131 ], [ 0, %entry ] ; <i64> [#uses=3] - br i1 undef, label %for.inc131, label %lor.lhs.false + br i1 %arg, label %for.inc131, label %lor.lhs.false lor.lhs.false: ; preds = %for.body %tmp15 = add i64 %indvar, 1 ; <i64> [#uses=1] @@ -123,11 +123,11 @@ for.body123: ; preds = %for.body123, %lor.l %add129 = add i32 %mul, %j.03 ; <i32> [#uses=1] tail call void undef(i32 %add129) %inc = add nsw i32 %j.03, 1 ; <i32> [#uses=1] - br i1 undef, label %for.inc131, label %for.body123 + br i1 %arg, label %for.inc131, label %for.body123 for.inc131: ; preds = %for.body123, %for.body %indvar.next = add i64 %indvar, 1 ; <i64> [#uses=1] - br i1 undef, label %for.end134, label %for.body + br i1 %arg, label %for.end134, label %for.body for.end134: ; preds = %for.inc131 ret void @@ -138,14 +138,14 @@ for.end134: ; preds = %for.inc131 ; require insert point adjustment. ; PR7306 -define fastcc i32 @GetOptimum() nounwind { +define fastcc i32 @GetOptimum(i1 %arg) nounwind { bb: br label %bb1 bb1: ; preds = %bb1, %bb %t = phi i32 [ 0, %bb ], [ %t2, %bb1 ] ; <i32> [#uses=1] %t2 = add i32 %t, undef ; <i32> [#uses=3] - br i1 undef, label %bb1, label %bb3 + br i1 %arg, label %bb1, label %bb3 bb3: ; preds = %bb1 %t4 = add i32 undef, -1 ; <i32> [#uses=1] @@ -155,13 +155,13 @@ bb5: ; preds = %bb16, %bb3 %t6 = phi i32 [ %t17, %bb16 ], [ 0, %bb3 ] ; <i32> [#uses=3] %t7 = add i32 undef, %t6 ; <i32> [#uses=2] %t8 = add i32 %t4, %t6 ; <i32> [#uses=1] - br i1 undef, label %bb9, label %bb10 + br i1 %arg, label %bb9, label %bb10 bb9: ; preds = %bb5 br label %bb10 bb10: ; preds = %bb9, %bb5 - br i1 undef, label %bb11, label %bb16 + br i1 %arg, label %bb11, label %bb16 bb11: ; preds = %bb10 %t12 = icmp ugt i32 %t7, %t2 ; <i1> [#uses=1] diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll index 5828f06bf1c3..41eae3ca03c2 100644 --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -52,7 +52,7 @@ define void @PR24199(i32 %a0) { entry: %i = alloca %struct.A, align 8 %tobool = icmp ne i32 %a0, 0 - br i1 undef, label %if.end, label %if.then + br i1 poison, label %if.end, label %if.then if.then: br label %if.end @@ -96,5 +96,3 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) !4 = !DIExpression() !5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer) !6 = !DILocation(line: 0, scope: !3) - - diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index ee5fd78c6437..62935f7e372b 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -193,13 +193,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index a46f9ed3d379..9bbd335a903b 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -179,14 +179,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -391,14 +391,14 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 4a9643c0f4fc..3a16ab656b11 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 4e27301436c3..0f817b2c727c 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length4(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize { define i32 @length8(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index bdb50f5b60c4..35fd373536bd 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 9347e5422022..f63885292318 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index ad9f2a30d75b..4a3f5a608e58 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -221,13 +221,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 8fe1a581cd9c..bb089e5ddda8 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -205,14 +205,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind { define i32 @length4(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length4: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind ret i32 %m @@ -260,6 +260,36 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ret i1 %c } +define i1 @length4_le(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_le: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setbe %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp slt i32 %m, 1 + ret i1 %c +} + +define i1 @length4_ge(ptr %X, ptr %Y) nounwind { +; X64-LABEL: length4_ge: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind + %c = icmp sgt i32 %m, -1 + ret i1 %c +} + define i1 @length4_eq_const(ptr %X) nounwind { ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: @@ -279,13 +309,13 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB18_3 +; X64-NEXT: jne .LBB20_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %res_block +; X64-NEXT: .LBB20_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -319,7 +349,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB20_3 +; X64-NEXT: jne .LBB22_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movzbl 4(%rdi), %eax ; X64-NEXT: movzbl 4(%rsi), %ecx @@ -327,7 +357,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB20_3: # %res_block +; X64-NEXT: .LBB22_3: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax @@ -348,7 +378,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB21_2 +; X64-NEXT: jne .LBB23_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -356,13 +386,13 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB21_3 -; X64-NEXT: .LBB21_2: # %res_block +; X64-NEXT: je .LBB23_3 +; X64-NEXT: .LBB23_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB21_3: # %endblock +; X64-NEXT: .LBB23_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind ret i32 %m @@ -376,7 +406,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB22_2 +; X64-NEXT: jne .LBB24_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 3(%rdi), %ecx ; X64-NEXT: movl 3(%rsi), %edx @@ -384,13 +414,13 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB22_3 -; X64-NEXT: .LBB22_2: # %res_block +; X64-NEXT: je .LBB24_3 +; X64-NEXT: .LBB24_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB22_3: # %endblock +; X64-NEXT: .LBB24_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -417,14 +447,14 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind { define i32 @length8(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length8: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind ret i32 %m @@ -524,7 +554,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 +; X64-NEXT: jne .LBB33_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx @@ -532,13 +562,13 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: je .LBB33_3 +; X64-NEXT: .LBB33_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB31_3: # %endblock +; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind ret i32 %m @@ -582,7 +612,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB34_2 +; X64-NEXT: jne .LBB36_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -590,13 +620,13 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB34_3 -; X64-NEXT: .LBB34_2: # %res_block +; X64-NEXT: je .LBB36_3 +; X64-NEXT: .LBB36_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB34_3: # %endblock +; X64-NEXT: .LBB36_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind ret i32 %m @@ -610,7 +640,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB35_2 +; X64-NEXT: jne .LBB37_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: movq 7(%rsi), %rdx @@ -618,13 +648,13 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB35_3 -; X64-NEXT: .LBB35_2: # %res_block +; X64-NEXT: je .LBB37_3 +; X64-NEXT: .LBB37_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB35_3: # %endblock +; X64-NEXT: .LBB37_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -640,20 +670,20 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: jne .LBB36_2 +; X64-NEXT: jne .LBB38_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: je .LBB36_3 -; X64-NEXT: .LBB36_2: # %res_block +; X64-NEXT: je .LBB38_3 +; X64-NEXT: .LBB38_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB36_3: # %endblock +; X64-NEXT: .LBB38_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind ret i32 %m @@ -681,20 +711,20 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: jne .LBB38_2 +; X64-NEXT: jne .LBB40_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movabsq $4051322327650219061, %rax # imm = 0x3839303132333435 ; X64-NEXT: movq 7(%rdi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: je .LBB38_3 -; X64-NEXT: .LBB38_2: # %res_block +; X64-NEXT: je .LBB40_3 +; X64-NEXT: .LBB40_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB38_3: # %endblock +; X64-NEXT: .LBB40_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq @@ -713,7 +743,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB39_2 +; X64-NEXT: jne .LBB41_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -721,13 +751,13 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB39_3 -; X64-NEXT: .LBB39_2: # %res_block +; X64-NEXT: je .LBB41_3 +; X64-NEXT: .LBB41_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB39_3: # %endblock +; X64-NEXT: .LBB41_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind ret i32 %m @@ -783,7 +813,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB41_2 +; X64-NEXT: jne .LBB43_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx @@ -791,13 +821,13 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB41_3 -; X64-NEXT: .LBB41_2: # %res_block +; X64-NEXT: je .LBB43_3 +; X64-NEXT: .LBB43_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: sbbl %eax, %eax ; X64-NEXT: orl $1, %eax -; X64-NEXT: .LBB41_3: # %endblock +; X64-NEXT: .LBB43_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -814,7 +844,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB42_2 +; X64-NEXT: jne .LBB44_2 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx @@ -822,13 +852,13 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: bswapq %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB42_3 -; X64-NEXT: .LBB42_2: # %res_block +; X64-NEXT: je .LBB44_3 +; X64-NEXT: .LBB44_2: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: sbbl %edx, %edx ; X64-NEXT: orl $1, %edx -; X64-NEXT: .LBB42_3: # %endblock +; X64-NEXT: .LBB44_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll index a10fbc10bf28..3dba5eb15d67 100644 --- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll +++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll @@ -21,10 +21,10 @@ define void @testfn(ptr nocapture %p) { ; CHECK-LABEL: testfn_scalar ; CHECK: retq -define void @testfn_scalar(ptr nocapture %j) local_unnamed_addr #0 align 2 { +define void @testfn_scalar(ptr nocapture %j, i1 %arg) local_unnamed_addr #0 align 2 { entry: %0 = bitcast i64 undef to <2 x float> - br i1 undef, label %if.end, label %if.then + br i1 %arg, label %if.end, label %if.then if.then: ; preds = %entry unreachable diff --git a/llvm/test/CodeGen/X86/mingw-refptr.ll b/llvm/test/CodeGen/X86/mingw-refptr.ll index 73f1a9880913..82a90aba3865 100644 --- a/llvm/test/CodeGen/X86/mingw-refptr.ll +++ b/llvm/test/CodeGen/X86/mingw-refptr.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X64 +; RUN: llc < %s -mtriple=x86_64-pc-cygwin | FileCheck %s -check-prefix=CHECK-X64 ; RUN: llc < %s -mtriple=i686-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X86 ; RUN: llc < %s -mtriple=i686-w64-mingw32-none-elf | FileCheck %s -check-prefix=CHECK-X86-ELF diff --git a/llvm/test/CodeGen/X86/misched-crash.ll b/llvm/test/CodeGen/X86/misched-crash.ll index 98818d9a102f..a421faba95f7 100644 --- a/llvm/test/CodeGen/X86/misched-crash.ll +++ b/llvm/test/CodeGen/X86/misched-crash.ll @@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx10" ; This function contains a cmp instruction with two users. ; Hoisting the last use requires trimming the EFLAGS live range to the second. -define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2) { +define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2, i1 %arg) { entry: %cmp = icmp ult i64 %_x1, %_x2 %cond = select i1 %cmp, i64 %_x1, i64 %_x2 @@ -33,7 +33,7 @@ for.body34.i: ; preds = %for.inc39.i, %if.th for.inc39.i: ; preds = %for.body34.i %inc41.i = add i64 %index.178.i, 1 - br i1 undef, label %return, label %for.body34.i + br i1 %arg, label %return, label %for.body34.i return: ; preds = %for.inc39.i, %for.body34.i, %land.lhs.true21, %entry ret void diff --git a/llvm/test/CodeGen/X86/pr10475.ll b/llvm/test/CodeGen/X86/pr10475.ll index 4dd5aab499ca..4275dc262c37 100644 --- a/llvm/test/CodeGen/X86/pr10475.ll +++ b/llvm/test/CodeGen/X86/pr10475.ll @@ -2,19 +2,19 @@ ; No check in a crash test -define void @autogen_262380_1000() { +define void @autogen_262380_1000(i1 %arg) { BB: br label %CF79 CF79: ; preds = %CF79, %BB - br i1 undef, label %CF79, label %CF84.critedge.critedge + br i1 %arg, label %CF79, label %CF84.critedge.critedge CF84.critedge.critedge: ; preds = %CF79 %L35 = load <8 x i32>, ptr undef br label %CF85 CF85: ; preds = %CF85, %CF84.critedge.critedge - br i1 undef, label %CF85, label %CF86 + br i1 %arg, label %CF85, label %CF86 CF86: ; preds = %CF86, %CF85 %B61 = sub <8 x i32> %L35, zeroinitializer @@ -23,7 +23,7 @@ CF86: ; preds = %CF86, %CF85 br i1 %E73, label %CF86, label %CF87 CF87: ; preds = %CF87, %CF86 - br i1 undef, label %CF87, label %CF88 + br i1 %arg, label %CF87, label %CF88 CF88: ; preds = %CF87 ret void diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll new file mode 100644 index 000000000000..d5119d45f97c --- /dev/null +++ b/llvm/test/CodeGen/X86/pr107423.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s + +define void @PR107423(<64 x i8> %arg, ptr %p0) { +; CHECK-LABEL: PR107423: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm3 +; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm4 +; CHECK-NEXT: vpaddb %xmm1, %xmm4, %xmm1 +; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm4 +; CHECK-NEXT: vpsllw $8, %xmm4, %xmm4 +; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsllw $8, %xmm1, %xmm1 +; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu %xmm0, 16(%rdi) +; CHECK-NEXT: vmovdqu %xmm2, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %i3 = bitcast <64 x i8> %arg to <32 x i16> + %i4 = shufflevector <32 x i16> %i3, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %i5 = shl <8 x i16> %i4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %i6 = bitcast <8 x i16> %i5 to <16 x i8> + %i7 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> + %i8 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i9 = shufflevector <64 x i8> %i7, <64 x i8> %i8, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %i10 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i11 = shufflevector <64 x i8> %i10, <64 x i8> %i9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> + %i12 = bitcast <64 x i8> %i11 to <32 x i16> + %i13 = shl <32 x i16> %i12, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %i14 = bitcast <32 x i16> %i13 to <64 x i8> + %i15 = shufflevector <64 x i8> %i14, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %i16 = shufflevector <64 x i8> %i11, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> + %i17 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i18 = shufflevector <64 x i8> %i16, <64 x i8> %i17, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %i19 = shufflevector <16 x i8> %i15, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i20 = shufflevector <64 x i8> %i19, <64 x i8> %i18, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> + %i21 = add <64 x i8> %i20, %i11 + %i22 = bitcast <64 x i8> %i21 to <32 x i16> + %i23 = shl <32 x i16> %i22, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %i24 = bitcast <32 x i16> %i23 to <64 x i8> + %i25 = shufflevector <64 x i8> %i24, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %i26 = bitcast <32 x i16> %i23 to <64 x i8> + %i28 = shufflevector <64 x i8> %i26, <64 x i8> poison, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> + %i32 = shufflevector <64 x i8> %i21, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %i33 = shufflevector <16 x i8> %i25, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i34 = shufflevector <64 x i8> %i32, <64 x i8> %i33, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %i35 = shufflevector <16 x i8> %i28, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i36 = shufflevector <64 x i8> %i35, <64 x i8> %i34, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> + %i37 = add <64 x i8> %i36, %i21 + %i38 = bitcast <64 x i8> %i37 to <32 x i16> + %i39 = shufflevector <32 x i16> %i38, <32 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %i40 = shl <8 x i16> %i39, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %i41 = bitcast <8 x i16> %i40 to <16 x i8> + %i42 = shufflevector <16 x i8> %i41, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i43 = shufflevector <64 x i8> %i42, <64 x i8> %i37, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> + %i44 = bitcast <64 x i8> %i43 to <32 x i16> + %i45 = shufflevector <32 x i16> %i44, <32 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %i46 = shl <8 x i16> %i45, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %i47 = bitcast <8 x i16> %i46 to <16 x i8> + %i48 = shufflevector <16 x i8> %i47, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %i49 = shufflevector <64 x i8> %i43, <64 x i8> %i48, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79> + %i50 = shufflevector <64 x i8> %i37, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %i51 = add <32 x i8> %i49, %i50 + store <32 x i8> %i51, ptr %p0, align 1 + ret void +} diff --git a/llvm/test/CodeGen/X86/pr11998.ll b/llvm/test/CodeGen/X86/pr11998.ll index caaf2710fba8..4b93c20e7c23 100644 --- a/llvm/test/CodeGen/X86/pr11998.ll +++ b/llvm/test/CodeGen/X86/pr11998.ll @@ -1,13 +1,13 @@ ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-- -mattr=+avx -define void @autogen_51367_5000(i8) { +define void @autogen_51367_5000(i8, i1 %arg) { BB: %B = srem i8 55, %0 %B9 = shl i8 %B, %B br label %CF CF: ; preds = %CF, %BB - br i1 undef, label %CF, label %CF403 + br i1 %arg, label %CF, label %CF403 CF403: ; preds = %CF403, %CF %S44 = icmp eq i8 %B9, %0 diff --git a/llvm/test/CodeGen/X86/pr32108.ll b/llvm/test/CodeGen/X86/pr32108.ll index 32f8a7657a3f..a50b9a676ae2 100644 --- a/llvm/test/CodeGen/X86/pr32108.ll +++ b/llvm/test/CodeGen/X86/pr32108.ll @@ -13,7 +13,7 @@ BB: br label %CF243 CF243: ; preds = %CF243, %BB - br i1 undef, label %CF243, label %CF257 + br i1 poison, label %CF243, label %CF257 CF257: ; preds = %CF243 %Shuff144 = shufflevector <4 x i1> undef, <4 x i1> %Cmp45, <4 x i32> <i32 undef, i32 undef, i32 5, i32 undef> diff --git a/llvm/test/CodeGen/X86/pr50254.ll b/llvm/test/CodeGen/X86/pr50254.ll index 01d261a3fd4b..95b7ae5e3e02 100644 --- a/llvm/test/CodeGen/X86/pr50254.ll +++ b/llvm/test/CodeGen/X86/pr50254.ll @@ -37,7 +37,7 @@ entry: br label %for.body for.body: ; preds = %entry - br i1 undef, label %for.end, label %for.body.1 + br i1 poison, label %for.end, label %for.body.1 for.end: ; preds = %for.body store i16 %xor1, ptr @d.e, align 4 diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index 4ca8ae91f9e6..779978b90349 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -100,7 +100,7 @@ bb_entry: br label %bb_8 bb_8: ; preds = %bb_last, %bb_entry - br i1 undef, label %bb_last, label %bb_mid + br i1 poison, label %bb_last, label %bb_mid bb_mid: ; preds = %bb_8 %i4 = getelementptr inbounds %t10, ptr %i1, i64 0, i32 1, i64 32 diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll index beb42f55b709..47e5079e9c36 100644 --- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -331,13 +331,13 @@ if.end: ] if.then4: - br i1 undef, label %SyTime.exit, label %if.then.i + br i1 poison, label %SyTime.exit, label %if.then.i if.then.i: unreachable SyTime.exit: - br i1 undef, label %SyTime.exit2681, label %if.then.i2673 + br i1 poison, label %SyTime.exit2681, label %if.then.i2673 if.then.i2673: unreachable @@ -349,7 +349,7 @@ land.lhs.true14: unreachable if.end25: - br i1 undef, label %SyTime.exit2720, label %if.then.i2712 + br i1 poison, label %SyTime.exit2720, label %if.then.i2712 if.then.i2712: unreachable @@ -406,7 +406,7 @@ do.end: %mul167 = shl i32 %rep.6, 2 %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6 %..ch.19 = select i1 false, i32 2, i32 0 - br i1 undef, label %while.body200, label %while.end1465 + br i1 poison, label %while.body200, label %while.end1465 while.body200: %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ] @@ -444,7 +444,7 @@ while.cond1037.preheader: br i1 %cmp10393273, label %if.end1070, label %land.rhs1041 while.cond635.preheader: - br i1 undef, label %for.body643.us, label %while.cond661 + br i1 poison, label %for.body643.us, label %while.cond661 for.body643.us: br label %for.body643.us @@ -488,7 +488,7 @@ land.rhs485: br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782 cond.true.i.i2780: - br i1 undef, label %land.lhs.true490, label %lor.rhs500 + br i1 poison, label %land.lhs.true490, label %lor.rhs500 cond.false.i.i2782: unreachable @@ -499,10 +499,10 @@ land.lhs.true490: lor.rhs500: ; Make sure spill is hoisted to a cold preheader in outside loop. %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256) - br i1 undef, label %land.lhs.true504, label %do.body479.backedge + br i1 poison, label %land.lhs.true504, label %do.body479.backedge land.lhs.true504: - br i1 undef, label %do.body479.backedge, label %if.end517 + br i1 poison, label %do.body479.backedge, label %if.end517 do.body479.backedge: %incdec.ptr480 = getelementptr i8, ptr %incdec.ptr4803316, i64 1 @@ -531,10 +531,10 @@ for.cond534: br i1 %cmp536, label %for.cond542.preheader, label %for.cond534 for.cond542.preheader: - br i1 undef, label %for.body545, label %for.end552 + br i1 poison, label %for.body545, label %for.end552 for.body545: - br i1 undef, label %for.end552, label %for.body545 + br i1 poison, label %for.end552, label %for.body545 for.end552: %s.2.lcssa = phi ptr [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ] @@ -554,7 +554,7 @@ while.cond864: br label %while.cond864 sw.bb956: - br i1 undef, label %if.then959, label %while.cond197.backedge + br i1 poison, label %if.then959, label %while.cond197.backedge if.then959: br label %while.cond962 @@ -600,7 +600,7 @@ while.end1465: ] for.cond1480.preheader: - br i1 undef, label %for.body1606.lr.ph, label %for.end1609 + br i1 poison, label %for.body1606.lr.ph, label %for.end1609 if.then1477: %p.1.lcssa3539 = phi ptr [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ] @@ -614,7 +614,7 @@ for.body1606.lr.ph: br label %for.end1609 for.end1609: - br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614 + br i1 poison, label %for.cond1659.preheader, label %land.lhs.true1614 land.lhs.true1614: br label %for.cond1659.preheader @@ -631,13 +631,13 @@ while.body1703.lr.ph: unreachable while.cond1683.preheader: - br i1 undef, label %while.body1691, label %while.end1693 + br i1 poison, label %while.body1691, label %while.end1693 while.body1679: %oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ] %3 = load ptr, ptr %echo.i3101, align 8, !tbaa !6 %call.i3062 = call i32 @fileno(ptr %3) - br i1 undef, label %if.then.i3069, label %syEchoch.exit3070 + br i1 poison, label %if.then.i3069, label %syEchoch.exit3070 if.then.i3069: br label %syEchoch.exit3070 diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index c9edd3f3e904..cd3d48110772 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -408,7 +408,7 @@ define dso_local void @PR42880(i32 %t0) { %x = ptrtoint ptr %add.ptr.i94 to i32 %sub2 = sub i32 %x, 0 %div = sdiv exact i32 %sub2, 24 - br i1 undef, label %if, label %then + br i1 poison, label %if, label %then then: %t1 = xor i32 %div, -1 diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll index e10e3dd1cd92..962b833ad9a1 100644 --- a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll +++ b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll @@ -28,7 +28,7 @@ define void @sample_test() { ; CHECK-NEXT: movd %xmm0, (%rax) ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: retq - br i1 undef, label %5, label %1 + br i1 poison, label %5, label %1 ; <label>:1 ; preds = %0 %2 = load <4 x i8>, ptr undef @@ -40,4 +40,3 @@ define void @sample_test() { ; <label>:5 ; preds = %1, %0 ret void } - diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 33180a7db893..72406aaa4efa 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -379,23 +379,23 @@ entry: ; CHECK-NEXT: .short 6 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long -define void @spillSubReg(i64 %arg) #0 { +define void @spillSubReg(i64 %arg, i1 %arg2) #0 { bb: - br i1 undef, label %bb1, label %bb2 + br i1 %arg2, label %bb1, label %bb2 bb1: unreachable bb2: %tmp = load i64, ptr inttoptr (i64 140685446136880 to ptr) - br i1 undef, label %bb16, label %bb17 + br i1 %arg2, label %bb16, label %bb17 bb16: unreachable bb17: %tmp32 = trunc i64 %tmp to i32 - br i1 undef, label %bb60, label %bb61 + br i1 %arg2, label %bb60, label %bb61 bb60: tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 1489b0295e93..77b1ac094cea 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -1014,7 +1014,7 @@ define void @swifterror_isel(ptr) { ; CHECK-i386-NEXT: retl entry: %swifterror = alloca swifterror ptr, align 8 - br i1 undef, label %5, label %1 + br i1 poison, label %5, label %1 %2 = phi i16 [ %4, %1 ], [ undef, %entry ] %3 = call i1 undef(i16 %2, ptr swiftself %0, ptr nocapture swifterror %swifterror) diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll index 629ba48fcae6..c75819c2fd2c 100644 --- a/llvm/test/CodeGen/X86/switch.ll +++ b/llvm/test/CodeGen/X86/switch.ll @@ -2563,7 +2563,7 @@ define i32 @pr27135(i32 %i) { ; NOOPT-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; NOOPT-NEXT: retq entry: - br i1 undef, label %sw, label %end + br i1 poison, label %sw, label %end sw: switch i32 %i, label %end [ i32 99, label %sw.bb diff --git a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll index ce5613f52309..9afdabd4ce13 100644 --- a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll +++ b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s -define i32 @tail_merge_unreachable(i32 %i) { +define i32 @tail_merge_unreachable(i32 %i, i1 %arg) { entry: - br i1 undef, label %sw, label %end + br i1 %arg, label %sw, label %end sw: switch i32 %i, label %end [ i32 99, label %sw.bb diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll new file mode 100644 index 000000000000..b62a07eec1ce --- /dev/null +++ b/llvm/test/CodeGen/X86/uint_to_half.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512 + +define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) { +; AVX1-LABEL: test_uitofp_v8i32_v8f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_uitofp_v8i32_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_uitofp_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = uitofp <8 x i32> %a to <8 x half> + ret <8 x half> %vec +} + +define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) { +; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <8 x half> %vec +} + +define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) { +; AVX1-LABEL: test_uitofp_v16i32_v16f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_uitofp_v16i32_v16f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_uitofp_v16i32_v16f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq + %vec = uitofp <16 x i32> %a to <16 x half> + ret <16 x half> %vec +} + +define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) { +; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq + %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <16 x half> %vec +} diff --git a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll index d784425d76d3..b09e2024db78 100644 --- a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll +++ b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll @@ -7,9 +7,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" -define double @fn1(ptr %arg, i64 %arg1) { +define double @fn1(ptr %arg, i64 %arg1, i1 %arg2) { Entry: - br i1 undef, label %Body, label %Exit + br i1 %arg2, label %Body, label %Exit Exit: ; preds = %Brancher7, %Entry ret double undef diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir index d26f79750771..ff5df9ad8885 100644 --- a/llvm/test/CodeGen/X86/update-terminator.mir +++ b/llvm/test/CodeGen/X86/update-terminator.mir @@ -10,14 +10,14 @@ declare void @dummy3() ; Function Attrs: nounwind - define void @f2() { - br i1 undef, label %bb1, label %bb3 + define void @f2(i1 %arg) { + br i1 %arg, label %bb1, label %bb3 bb1: call void @dummy1() call void @dummy1() call void @dummy1() - br i1 undef, label %bb2, label %bb2 + br i1 %arg, label %bb2, label %bb2 bb2: call void @dummy2() diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 460c5fe11f82..78dd2cf783ef 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -517,7 +517,7 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -647,7 +647,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -993,7 +993,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d06993da6365..746c09e5e70d 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -522,7 +522,7 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -652,7 +652,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1010,7 +1010,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index bac118095331..be7888cd76a6 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -604,7 +604,7 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -730,7 +730,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1046,7 +1046,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index ab75ada72f25..ceb1ad13bc15 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -647,7 +647,7 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) @@ -773,7 +773,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpnleub %xmm0, %xmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) @@ -1093,7 +1093,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 4f42d5c65528..15e287d66754 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4129,6 +4129,62 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ret <32 x i8> %shuffle } +; PR121823 +define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,9,0,3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],zero,zero,zero,zero +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15] +; AVX512VLBW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,9,0,3,11,2,5,13,4,7,15,6,17,25,16,19,27,18,21,29,20,23,31,22,56,57,58,59,60,61,62,63] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],xmm1[1,9,0,3] +; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; XOPAVX2-NEXT: retq + %r = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 0, i32 3, i32 11, i32 2, i32 5, i32 13, i32 4, i32 7, i32 15, i32 6, i32 17, i32 25, i32 16, i32 19, i32 27, i32 18, i32 21, i32 29, i32 20, i32 23, i32 31, i32 22, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48> + ret <32 x i8> %r +} + define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index f0b70ae26b1f..4125d7878371 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -190,7 +190,7 @@ define i64 @PR55050() { entry: %i275 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer) %i277 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer) - br i1 undef, label %exit, label %if + br i1 poison, label %exit, label %if if: %i298 = bitcast <2 x i64> %i275 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index fe7459ea45e1..928f29b7b188 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -868,7 +868,7 @@ define void @infiniteloop() { ; DISABLE-NEXT: popq %rbp ; DISABLE-NEXT: retq entry: - br i1 undef, label %if.then, label %if.end + br i1 poison, label %if.then, label %if.end if.then: %ptr = alloca i32, i32 4 @@ -983,7 +983,7 @@ define void @infiniteloop2() { ; DISABLE-NEXT: popq %rbp ; DISABLE-NEXT: retq entry: - br i1 undef, label %if.then, label %if.end + br i1 poison, label %if.then, label %if.end if.then: %ptr = alloca i32, i32 4 @@ -994,7 +994,7 @@ for.body: ; preds = %for.body, %entry %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"() %add = add nsw i32 %call, %sum.03 store i32 %add, ptr %ptr - br i1 undef, label %body1, label %body2 + br i1 poison, label %body1, label %body2 body1: tail call void asm sideeffect "nop", "~{ebx}"() @@ -1074,10 +1074,10 @@ define void @infiniteloop3() { ; DISABLE-NEXT: LBB12_7: ## %end ; DISABLE-NEXT: retq entry: - br i1 undef, label %loop2a, label %body + br i1 poison, label %loop2a, label %body body: ; preds = %entry - br i1 undef, label %loop2a, label %end + br i1 poison, label %loop2a, label %end loop1: ; preds = %loop2a, %loop2b %var.phi = phi ptr [ %next.phi, %loop2b ], [ %var, %loop2a ] |
