summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/NVPTX/math-intrins.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/NVPTX/math-intrins.ll')
-rw-r--r--llvm/test/CodeGen/NVPTX/math-intrins.ll416
1 files changed, 414 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index e9635e939398..5a55fa97033b 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -3,8 +3,8 @@
; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-SM80-NOF16
; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
-; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
-; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %}
target triple = "nvptx64-nvidia-cuda"
@@ -42,6 +42,14 @@ declare half @llvm.maximum.f16(half, half) #0
declare float @llvm.maximum.f32(float, float) #0
declare double @llvm.maximum.f64(double, double) #0
declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.minimumnum.f16(half, half) #0
+declare float @llvm.minimumnum.f32(float, float) #0
+declare double @llvm.minimumnum.f64(double, double) #0
+declare <2 x half> @llvm.minimumnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.maximumnum.f16(half, half) #0
+declare float @llvm.maximumnum.f32(float, float) #0
+declare double @llvm.maximumnum.f64(double, double) #0
+declare <2 x half> @llvm.maximumnum.v2f16(<2 x half>, <2 x half>) #0
declare float @llvm.fma.f32(float, float, float) #0
declare double @llvm.fma.f64(double, double, double) #0
@@ -1486,6 +1494,410 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
ret <2 x half> %x
}
+; ---- minimumnum ----
+
+; Scalar f16 llvm.minimumnum. The F16 expectations are a single min.f16 on the
+; two loaded b16 parameters. Both NOF16 expectation sets instead widen each
+; operand with cvt.f32.f16, apply min.f32, and narrow the result back with
+; cvt.rn.f16.f32 before storing it.
+define half @minimumnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minimumnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimumnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimumnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.minimumnum.f16(half %a, half %b)
+ ret half %x
+}
+
+; Scalar f32 llvm.minimumnum. A single shared expectation set is used: load both
+; b32 parameters, combine them with one min.f32, and store the result.
+define float @minimumnum_float(float %a, float %b) {
+; CHECK-LABEL: minimumnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_param_1];
+; CHECK-NEXT: min.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.minimumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; Same f32 llvm.minimumnum call as the plain float test, but the function carries
+; attribute group #1 and the expected instruction is min.ftz.f32.
+; NOTE(review): #1 is defined outside this excerpt; presumably it requests
+; flush-to-zero denormal handling -- confirm against the attribute definition.
+define float @minimumnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: minimumnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_ftz_param_1];
+; CHECK-NEXT: min.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.minimumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; Scalar f64 llvm.minimumnum. A single shared expectation set is used: load both
+; b64 parameters, combine them with one min.f64, and store the result.
+define double @minimumnum_double(double %a, double %b) {
+; CHECK-LABEL: minimumnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [minimumnum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [minimumnum_double_param_1];
+; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.minimumnum.f64(double %a, double %b)
+ ret double %x
+}
+
+; TODO: Improve the "Expand" path for minimumnum vectors on targets where
+; f16 is not supported. Ideally it should first lower to two f32 minimumnum
+; operations instead of fully expanding the operation into compare/select instructions.
+; <2 x half> llvm.minimumnum. The F16 expectations load each vector as one b32
+; and use a single min.f16x2. Both NOF16 expectation sets are a per-lane
+; compare/select sequence: setp.nan + selp replaces a NaN operand with the other
+; operand, setp.lt + selp picks the smaller value, and the b16 compares against
+; -32768 (bit pattern 0x8000, i.e. f16 -0.0) plus the final compare against
+; 0f00000000 choose a -0.0 operand when the selected value compares equal to zero.
+define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minimumnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
+; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimumnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [minimumnum_v2half_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [minimumnum_v2half_param_1];
+; CHECK-F16-NEXT: min.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimumnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %x
+}
+
+; ---- maximumnum ----
+
+; Scalar f16 llvm.maximumnum. The F16 expectations are a single max.f16 on the
+; two loaded b16 parameters. Both NOF16 expectation sets instead widen each
+; operand with cvt.f32.f16, apply max.f32, and narrow the result back with
+; cvt.rn.f16.f32 before storing it.
+define half @maximumnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maximumnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximumnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximumnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.maximumnum.f16(half %a, half %b)
+ ret half %x
+}
+
+; Scalar f32 llvm.maximumnum. A single shared expectation set is used: load both
+; b32 parameters, combine them with one max.f32, and store the result.
+define float @maximumnum_float(float %a, float %b) {
+; CHECK-LABEL: maximumnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_param_1];
+; CHECK-NEXT: max.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.maximumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; Same f32 llvm.maximumnum call as the plain float test, but the function carries
+; attribute group #1 and the expected instruction is max.ftz.f32.
+; NOTE(review): #1 is defined outside this excerpt; presumably it requests
+; flush-to-zero denormal handling -- confirm against the attribute definition.
+define float @maximumnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: maximumnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_ftz_param_1];
+; CHECK-NEXT: max.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %x = call float @llvm.maximumnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; Scalar f64 llvm.maximumnum. A single shared expectation set is used: load both
+; b64 parameters, combine them with one max.f64, and store the result.
+define double @maximumnum_double(double %a, double %b) {
+; CHECK-LABEL: maximumnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [maximumnum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [maximumnum_double_param_1];
+; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.maximumnum.f64(double %a, double %b)
+ ret double %x
+}
+
+; TODO: Improve the "Expand" path for maximumnum vectors on targets where
+; f16 is not supported. Ideally it should first lower to two f32 maximumnum
+; operations instead of fully expanding the operation into compare/select instructions.
+; <2 x half> llvm.maximumnum. The F16 expectations load each vector as one b32
+; and use a single max.f16x2. Both NOF16 expectation sets are a per-lane
+; compare/select sequence: setp.nan + selp replaces a NaN operand with the other
+; operand, setp.gt + selp picks the larger value, and the b16 compares against 0
+; (the f16 +0.0 bit pattern) plus the final compare against 0f00000000 choose a
+; +0.0 operand when the selected value compares equal to zero.
+define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maximumnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
+; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximumnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [maximumnum_v2half_param_0];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [maximumnum_v2half_param_1];
+; CHECK-F16-NEXT: max.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximumnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
+; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %x
+}
+
; ---- fma ----
define float @fma_float(float %a, float %b, float %c) {