diff options
| author | Koakuma <koachan@protonmail.com> | 2024-07-08 19:19:54 +0700 |
|---|---|---|
| committer | Koakuma <koachan@protonmail.com> | 2024-07-08 19:19:54 +0700 |
| commit | 5c4fdc2fd5898ebd9e89999a4f4b8aa289ca637f (patch) | |
| tree | f3b92a07f3dfc6e70f36d1000605f36a3c15af46 /llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll | |
| parent | dbda8e2f2cd8764e0badd983915d62a2c3377f4d (diff) | |
| parent | e9b8cd0c806db00f0981fb36717077c941426302 (diff) | |
[𝘀𝗽𝗿] changes introduced through rebase (users/koachan/spr/main.sparcias-enable-parseforallfeatures-in-matchoperandparserimpl)
Created using spr 1.3.5
[skip ci]
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll | 849 |
1 files changed, 377 insertions, 472 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 526ee5a51745..3dea5cb5c142 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -477,42 +477,38 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: clpeak_imad_pat_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: clpeak_imad_pat_v2i16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 
op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, <i16 1, i16 1> @@ -691,22 +687,20 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, v2, v0 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, 
v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: clpeak_imad_pat_v3i16: @@ -714,41 +708,37 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: clpeak_imad_pat_v3i16: @@ -756,47 +746,41 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX10-GISEL-NEXT: 
v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v4, 
v0, v2, v0 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: clpeak_imad_pat_v3i16: @@ -805,25 +789,21 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 
1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1> @@ -1088,22 +1068,20 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, 
v2, v0 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: clpeak_imad_pat_v4i16: @@ -1111,41 +1089,37 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 
v7, v5, v3 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, 
v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: clpeak_imad_pat_v4i16: @@ -1153,47 +1127,41 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 
op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 
v1, v1, v4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: clpeak_imad_pat_v4i16: @@ -1202,25 +1170,21 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; 
GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1> @@ -1517,42 +1481,38 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: clpeak_umad_pat_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: clpeak_umad_pat_v2i16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, <i16 1, i16 1> @@ -1731,22 +1691,20 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, v2, v0 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; 
GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: clpeak_umad_pat_v3i16: @@ -1754,41 +1712,37 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; 
GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: clpeak_umad_pat_v3i16: @@ -1796,47 +1750,41 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: 
v_pk_add_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 
op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: clpeak_umad_pat_v3i16: @@ -1845,25 +1793,21 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 
v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1> @@ -2128,22 +2072,20 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 
op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, v2, v0 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3 +; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0] +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: clpeak_umad_pat_v4i16: @@ -2151,41 +2093,37 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX9-GISEL-NEXT: 
v_pk_mad_u16 v5, v1, v3, v1 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-SDAG-NEXT: 
v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: clpeak_umad_pat_v4i16: @@ -2193,47 +2131,41 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; 
%entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX11-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: clpeak_umad_pat_v4i16: @@ -2242,25 +2174,21 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1> @@ -7192,69 +7120,58 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v0, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mad_u16 v0, v2, v0, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, <i16 1, i16 1> @@ -7500,69 +7417,58 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v0, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mad_u16 v0, v2, v0, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v2, 
v1, v0, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX10-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: 
v_pk_mul_lo_u16 v3, v2, v1 +; GFX10-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mad_u16 v2, v1, v0, v0 +; GFX11-NEXT: v_pk_mad_u16 v0, v1, v0, 1 op_sel_hi:[1,1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v3, v2, v1 +; GFX11-NEXT: v_pk_mad_u16 v1, v2, 
v1, 1 op_sel_hi:[1,1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v3, v0 ; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, <i16 1, i16 1> @@ -7955,26 +7861,26 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX9-LABEL: multi_use_mul_mad_v2i16_var: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v2 +; GFX9-NEXT: v_pk_mad_u16 v1, v0, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: multi_use_mul_mad_v2i16_var: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v2 +; GFX10-NEXT: v_pk_mad_u16 v1, v0, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: multi_use_mul_mad_v2i16_var: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v2 +; GFX11-NEXT: v_pk_mad_u16 v1, v0, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul <2 x i16> %x, %y @@ -8067,28 +7973,27 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX9-LABEL: other_use_mul_mad_v2i16_var: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_pk_mul_lo_u16 v1, v0, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX9-NEXT: ds_write_b32 v3, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v1 +; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, v2 +; GFX9-NEXT: ds_write_b32 v3, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: other_use_mul_mad_v2i16_var: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX10-NEXT: ds_write_b32 v3, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v4, v0, v1 +; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, v2 +; GFX10-NEXT: ds_write_b32 v3, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: other_use_mul_mad_v2i16_var: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX11-NEXT: ds_store_b32 v3, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v4, v0, v1 +; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, v2 +; GFX11-NEXT: ds_store_b32 v3, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: |
