diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/div_i128.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/div_i128.ll | 278 |
1 files changed, 153 insertions, 125 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 6686742e449f..d94ec56842ab 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -581,10 +581,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] @@ -592,8 +588,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -640,6 +641,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -648,15 +656,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -670,6 +672,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -694,13 +702,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -727,6 +730,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -736,6 +740,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -747,10 +752,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -890,6 +897,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload @@ -906,12 +916,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 @@ -992,7 +999,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 @@ -1024,6 +1030,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1032,12 +1041,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -1048,7 +1054,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -1152,7 +1158,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 @@ -1710,10 +1715,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_5 ; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -1721,10 +1722,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_9 ; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit @@ -1784,6 +1792,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_3 ; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload @@ -1792,17 +1807,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 @@ -1812,6 +1823,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_branch .LBB0_4 ; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload @@ -1836,15 +1853,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 @@ -1894,8 +1907,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v28, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v25, v33 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28 @@ -1915,6 +1930,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v13, s[8:9], v13, v4 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9] ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9] @@ -2027,6 +2043,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-G-O0-NEXT: s_branch .LBB0_1 ; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -2044,15 +2063,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v17 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v16 ; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v18, v4 @@ -2063,7 +2079,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v18, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v18, v[20:21] ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[25:26], v18, v[22:23] ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v5, v[20:21] @@ -2112,7 +2128,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6 ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -2144,6 +2159,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_6 ; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -2152,20 +2170,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] @@ -2257,7 +2272,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17) ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4 ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -2761,10 +2775,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2772,8 +2782,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2820,6 +2835,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2828,15 +2850,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2850,6 +2866,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -2874,13 +2896,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 @@ -2907,6 +2924,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 @@ -2916,6 +2934,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 @@ -2927,10 +2946,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -3070,6 +3091,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -3086,12 +3110,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s6, 64 @@ -3172,7 +3193,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -3204,6 +3224,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -3212,12 +3235,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -3228,7 +3248,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -3332,7 +3352,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -3793,10 +3812,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_5 ; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3804,10 +3819,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_9 ; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit @@ -3867,6 +3889,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_3 ; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -3875,17 +3904,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 @@ -3895,6 +3920,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_branch .LBB1_4 ; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -3919,15 +3950,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 @@ -3977,8 +4004,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v28, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v32 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28 @@ -3998,6 +4027,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] @@ -4118,6 +4148,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-G-O0-NEXT: s_branch .LBB1_1 ; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload @@ -4135,14 +4168,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 @@ -4208,7 +4238,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6 ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -4240,6 +4269,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_6 ; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload @@ -4248,20 +4280,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v1, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7] @@ -4353,7 +4382,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17) ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4 ; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 |
