summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/div_i128.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/div_i128.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/div_i128.ll278
1 files changed, 153 insertions, 125 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6686742e449f..d94ec56842ab 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -581,10 +581,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
@@ -592,8 +588,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4
; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
@@ -640,6 +641,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_3
; GFX9-O0-NEXT: .LBB0_5: ; %Flow1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9
+; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
@@ -648,15 +656,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9
-; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
@@ -670,6 +672,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10
+; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
@@ -694,13 +702,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11
; GFX9-O0-NEXT: s_mov_b32 s4, 63
+; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
; GFX9-O0-NEXT: s_mov_b32 s5, 1
@@ -727,6 +730,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
@@ -736,6 +740,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
@@ -747,10 +752,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -890,6 +897,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
@@ -906,12 +916,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
@@ -992,7 +999,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10
; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
@@ -1024,6 +1030,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -1032,12 +1041,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
@@ -1048,7 +1054,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -1152,7 +1158,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8
; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
@@ -1710,10 +1715,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB0_5
; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
@@ -1721,10 +1722,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB0_9
; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
@@ -1784,6 +1792,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB0_3
; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
@@ -1792,17 +1807,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4
-; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5
-; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
@@ -1812,6 +1823,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_branch .LBB0_4
; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6
+; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7
; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
@@ -1836,15 +1853,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6
-; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
@@ -1894,8 +1907,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v28, v30
; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v32
; GFX9-G-O0-NEXT: v_mov_b32_e32 v25, v33
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28
@@ -1915,6 +1930,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v15
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v13, s[8:9], v13, v4
; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9]
; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9]
@@ -2027,6 +2043,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-G-O0-NEXT: s_branch .LBB0_1
; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -2044,15 +2063,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-G-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v17
; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v16
; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v18, v4
@@ -2063,7 +2079,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v18, v6
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6
; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v6
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v18, v[20:21]
; GFX9-G-O0-NEXT: v_lshrrev_b64 v[25:26], v18, v[22:23]
; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v5, v[20:21]
@@ -2112,7 +2128,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
@@ -2144,6 +2159,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB0_6
; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -2152,20 +2170,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-G-O0-NEXT: s_mov_b32 s6, 1
; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s9
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7]
@@ -2257,7 +2272,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17)
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
@@ -2761,10 +2775,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_5
; GFX9-O0-NEXT: .LBB1_3: ; %Flow2
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -2772,8 +2782,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2
; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
@@ -2820,6 +2835,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_3
; GFX9-O0-NEXT: .LBB1_5: ; %Flow1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
+; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
+; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
@@ -2828,15 +2850,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
-; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
@@ -2850,6 +2866,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_4
; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8
+; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
@@ -2874,13 +2896,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9
; GFX9-O0-NEXT: s_mov_b32 s4, 63
+; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
; GFX9-O0-NEXT: s_mov_b32 s5, 1
@@ -2907,6 +2924,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
@@ -2916,6 +2934,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
@@ -2927,10 +2946,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -3070,6 +3091,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6
; GFX9-O0-NEXT: s_branch .LBB1_1
; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
@@ -3086,12 +3110,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[20:21]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: s_mov_b32 s6, 64
@@ -3172,7 +3193,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8
; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -3204,6 +3224,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_6
; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -3212,12 +3235,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_mov_b32 s4, s7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
@@ -3228,7 +3248,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -3332,7 +3352,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6
; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -3793,10 +3812,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB1_5
; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -3804,10 +3819,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 0
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 1
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB1_9
; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit
@@ -3867,6 +3889,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB1_3
; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
@@ -3875,17 +3904,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT: v_readlane_b32 s4, v34, 4
-; GFX9-G-O0-NEXT: v_readlane_b32 s5, v34, 5
-; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
@@ -3895,6 +3920,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_branch .LBB1_4
; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while
; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6
+; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7
; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
@@ -3919,15 +3950,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT: v_readlane_b32 s6, v34, 6
-; GFX9-G-O0-NEXT: v_readlane_b32 s7, v34, 7
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
@@ -3977,8 +4004,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v28, v30
; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v32
; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v28
@@ -3998,6 +4027,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4
; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
@@ -4118,6 +4148,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6
; GFX9-G-O0-NEXT: s_branch .LBB1_1
; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
@@ -4135,14 +4168,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-G-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5
; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7
; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
@@ -4208,7 +4238,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s8, 6
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s9, 7
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -4240,6 +4269,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB1_6
; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
@@ -4248,20 +4280,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-G-O0-NEXT: s_mov_b32 s6, 1
; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v1, v4
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s10
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7]
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s9
; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7]
@@ -4353,7 +4382,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(17)
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s6, 4
; GFX9-G-O0-NEXT: v_writelane_b32 v34, s7, 5
; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1