summary | refs | log | tree | commit | diff
path: root/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2024-07-15 09:59:07 +0400
committer Matt Arsenault <arsenm2@gmail.com> 2024-07-15 11:51:44 +0400
commit b1bcb7ca460fcd317bbc8309e14c8761bf8394e0 (patch)
tree cf2636217534435b2de9783a7cf8e9325819e658 /llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
parent 71051deff27928cff908ea794e09806eee662801 (diff)
Reapply "AMDGPU: Move attributor into optimization pipeline (#83131)" and follow up commit "clang/AMDGPU: Defeat attribute optimization in attribute test" (#98851)
This reverts commit adaff46d087799072438dd744b038e6fd50a2d78. Drop the -O3 checks from default-attributes.hip. I don't know why they are different on some bots but reverting this is far too disruptive.
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll 2219
1 files changed, 1160 insertions, 1059 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index b0b40aa952a9..3784af443c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -24,7 +24,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -35,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB0_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -52,7 +52,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -90,8 +90,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -107,7 +107,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -119,8 +119,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
@@ -131,24 +132,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s3, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, s3
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
@@ -160,7 +162,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -177,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -192,24 +194,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_mul_i32 s3, s3, 5
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1
; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -232,12 +234,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb
+; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -249,8 +251,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB1_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -262,13 +264,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -280,8 +282,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
@@ -293,13 +295,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -310,8 +312,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -323,13 +325,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -342,8 +344,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
@@ -354,39 +357,40 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c
; GFX1032-NEXT: s_mov_b32 s4, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032-NEXT: s_mul_i32 s4, s0, s4
; GFX1032-NEXT: v_mov_b32_e32 v2, s4
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c
+; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -404,8 +408,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -419,9 +423,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -430,22 +434,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s4, s2, s4
+; GFX1132-NEXT: s_mul_i32 s4, s0, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -460,7 +464,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: add_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -473,27 +477,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB2_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -502,8 +506,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -514,27 +518,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -542,8 +546,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -554,26 +558,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_add_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB2_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -583,8 +587,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
@@ -595,36 +600,37 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_add_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_add_i32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB2_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
@@ -635,43 +641,45 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB2_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -681,41 +689,42 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_add_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_add_i32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB2_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -854,16 +863,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1164-LABEL: add_i32_varying_nouse:
; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s2, 0
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: s_add_i32 s2, s2, s6
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
@@ -886,16 +896,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1132-LABEL: add_i32_varying_nouse:
; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: s_add_i32 s0, s0, s3
; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
@@ -929,7 +940,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -940,8 +951,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB4_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -962,7 +973,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -973,10 +984,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
@@ -994,7 +1005,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1004,10 +1015,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
@@ -1025,7 +1036,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1037,8 +1048,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
@@ -1050,24 +1062,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s3, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
@@ -1080,7 +1093,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -1097,8 +1110,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1113,25 +1126,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132-NEXT: v_mov_b32_e32 v0, s1
; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1155,7 +1168,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX7LESS-LABEL: add_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
@@ -1196,7 +1209,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -1234,7 +1247,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -1272,7 +1285,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -1308,7 +1321,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
@@ -1343,7 +1356,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX1164-LABEL: add_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
@@ -1384,7 +1397,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX1132-LABEL: add_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132-NEXT: s_mov_b32 s5, exec_lo
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
@@ -1432,7 +1445,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: add_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -1447,7 +1460,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1459,7 +1472,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1471,7 +1484,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1481,20 +1494,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1164-LABEL: add_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: add_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -1513,7 +1542,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1524,8 +1553,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB7_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -1542,7 +1571,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB7_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1553,8 +1582,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB7_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
@@ -1571,7 +1600,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB7_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1581,8 +1610,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
@@ -1599,7 +1628,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1611,8 +1640,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@@ -1624,24 +1654,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s3, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v2, s3
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -1654,7 +1685,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-LABEL: sub_i32_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1671,8 +1702,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -1687,24 +1718,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: sub_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB7_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_mul_i32 s3, s3, 5
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1
; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -1728,12 +1759,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-LABEL: sub_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb
+; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1745,8 +1776,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB8_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -1758,13 +1789,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB8_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1776,8 +1807,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB8_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
@@ -1789,13 +1820,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1806,8 +1837,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB8_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -1819,13 +1850,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -1838,8 +1869,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
@@ -1851,40 +1882,40 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
;
; GFX1032-LABEL: sub_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c
; GFX1032-NEXT: s_mov_b32 s4, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032-NEXT: s_mul_i32 s4, s0, s4
; GFX1032-NEXT: v_mov_b32_e32 v2, s4
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c
+; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1902,8 +1933,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
@@ -1918,9 +1949,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
;
; GFX1132-LABEL: sub_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1929,23 +1960,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s4, s2, s4
+; GFX1132-NEXT: s_mul_i32 s4, s0, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1960,7 +1991,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: sub_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -1973,27 +2004,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB9_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB9_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -2002,8 +2033,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -2014,27 +2045,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB9_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -2042,8 +2073,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -2054,26 +2085,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_add_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB9_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -2083,8 +2114,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB9_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
@@ -2095,36 +2127,37 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_add_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_add_i32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB9_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB9_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
@@ -2135,43 +2168,45 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: sub_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB9_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -2181,41 +2216,42 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: sub_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_add_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_add_i32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB9_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -2354,16 +2390,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1164-LABEL: sub_i32_varying_nouse:
; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s2, 0
; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: s_add_i32 s2, s2, s6
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
@@ -2386,16 +2423,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1132-LABEL: sub_i32_varying_nouse:
; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: s_mov_b32 s0, 0
; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: s_add_i32 s0, s0, s3
; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
@@ -2429,7 +2467,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2440,8 +2478,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB11_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
@@ -2462,7 +2500,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB11_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2473,8 +2511,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB11_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2495,7 +2533,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB11_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2505,8 +2543,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB11_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2527,7 +2565,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2539,8 +2577,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -2555,24 +2594,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s3, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v0, s3
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
@@ -2588,7 +2628,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-LABEL: sub_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -2605,8 +2645,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB11_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
@@ -2624,25 +2664,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_mov_b32_e32 v0, s3
+; GFX1132-NEXT: v_mov_b32_e32 v0, s1
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB11_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
@@ -2669,7 +2709,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
@@ -2710,7 +2750,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -2749,7 +2789,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -2789,7 +2829,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
@@ -2828,7 +2868,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
@@ -2866,7 +2906,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX1164-LABEL: sub_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
@@ -2909,7 +2949,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX1132-LABEL: sub_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132-NEXT: s_mov_b32 s5, exec_lo
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
@@ -2959,7 +2999,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -2974,7 +3014,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2986,7 +3026,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2998,7 +3038,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3008,20 +3048,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: sub_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX1164-LABEL: sub_i64_varying:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: buffer_gl0_inv
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: sub_i64_varying:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: buffer_gl0_inv
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
@@ -3035,7 +3091,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: and_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3048,27 +3104,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: and_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, -1
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB14_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_and_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB14_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -3077,8 +3133,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -3089,27 +3145,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: and_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, -1
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_and_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB14_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -3117,8 +3173,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -3129,26 +3185,26 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: and_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, -1
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_and_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB14_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3158,8 +3214,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB14_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1
@@ -3170,36 +3227,37 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: and_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, -1
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_and_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_and_b32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB14_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB14_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1
@@ -3210,43 +3268,45 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: and_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, -1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_and_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB14_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3256,41 +3316,42 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: and_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, -1
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_and_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_and_b32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB14_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1
+; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3309,7 +3370,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: or_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3322,27 +3383,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB15_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_or_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB15_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -3351,8 +3412,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB15_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -3363,27 +3424,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: or_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_or_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB15_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -3391,8 +3452,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB15_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -3403,26 +3464,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: or_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_or_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB15_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3432,8 +3493,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB15_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1
@@ -3444,36 +3506,37 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: or_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_or_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_or_b32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB15_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB15_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1
@@ -3484,43 +3547,45 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: or_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_or_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB15_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB15_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3530,41 +3595,42 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: or_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_or_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_or_b32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB15_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB15_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1
+; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3583,7 +3649,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3596,27 +3662,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB16_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_xor_b32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB16_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -3625,8 +3691,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB16_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -3637,27 +3703,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB16_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_xor_b32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB16_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -3665,8 +3731,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB16_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -3677,26 +3743,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_xor_b32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB16_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3706,8 +3772,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB16_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1
@@ -3718,36 +3785,37 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_xor_b32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_xor_b32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB16_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB16_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1
@@ -3758,43 +3826,45 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: xor_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_xor_b32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB16_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB16_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3804,41 +3874,42 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: xor_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_xor_b32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_xor_b32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB16_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB16_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1
+; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3857,7 +3928,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3870,27 +3941,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_brev_b32 s4, 1
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB17_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_max_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB17_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -3899,8 +3970,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB17_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -3911,27 +3982,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: max_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_brev_b32 s4, 1
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_max_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB17_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -3939,8 +4010,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB17_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -3951,26 +4022,26 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_brev_b32 s4, 1
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_max_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB17_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3980,8 +4051,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB17_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1
@@ -3992,36 +4064,37 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_brev_b32 s2, 1
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_brev_b32 s0, 1
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_max_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_max_i32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB17_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB17_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1
@@ -4032,43 +4105,45 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: max_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_brev_b32 s4, 1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_max_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB17_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB17_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4078,41 +4153,42 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: max_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_brev_b32 s2, 1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_brev_b32 s0, 1
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_max_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_max_i32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB17_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB17_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1
+; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4135,7 +4211,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
@@ -4145,8 +4221,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB18_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
@@ -4169,7 +4245,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB18_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
@@ -4179,10 +4255,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
@@ -4203,7 +4279,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB18_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
@@ -4212,10 +4288,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
@@ -4236,7 +4312,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB18_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
@@ -4247,8 +4323,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
@@ -4267,7 +4344,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB18_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
@@ -4278,8 +4355,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
@@ -4300,7 +4378,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB18_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
@@ -4310,8 +4388,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB18_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
@@ -4334,7 +4412,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB18_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
@@ -4343,8 +4421,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB18_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
@@ -4371,7 +4449,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: min_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4384,27 +4462,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: min_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_brev_b32 s4, -2
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB19_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_min_i32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB19_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -4413,8 +4491,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB19_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -4425,27 +4503,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: min_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB19_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_min_i32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB19_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -4453,8 +4531,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB19_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -4465,26 +4543,26 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_brev_b32 s4, -2
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_min_i32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB19_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -4494,8 +4572,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB19_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1
@@ -4506,36 +4585,37 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_brev_b32 s2, -2
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_brev_b32 s0, -2
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_min_i32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_min_i32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB19_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB19_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1
@@ -4546,43 +4626,45 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: min_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_brev_b32 s4, -2
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_min_i32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB19_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB19_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4592,41 +4674,42 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: min_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_brev_b32 s2, -2
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_brev_b32 s0, -2
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_min_i32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_min_i32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB19_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB19_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1
+; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4649,7 +4732,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
@@ -4659,8 +4742,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB20_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
@@ -4683,7 +4766,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB20_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
@@ -4693,10 +4776,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
@@ -4717,7 +4800,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB20_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
@@ -4726,10 +4809,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
@@ -4750,7 +4833,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB20_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
@@ -4761,8 +4844,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
@@ -4781,7 +4865,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB20_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
@@ -4792,8 +4876,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
@@ -4814,7 +4899,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB20_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
@@ -4824,8 +4909,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB20_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
@@ -4848,7 +4933,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB20_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
@@ -4857,8 +4942,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB20_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
@@ -4885,7 +4970,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: umax_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4898,27 +4983,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: umax_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB21_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_max_u32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB21_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -4927,8 +5012,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -4939,27 +5024,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: umax_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB21_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_max_u32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB21_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -4967,8 +5052,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -4979,26 +5064,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: umax_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_max_u32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB21_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -5008,8 +5093,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB21_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1
@@ -5020,36 +5106,37 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: umax_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_max_u32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_max_u32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB21_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB21_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1
@@ -5060,43 +5147,45 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: umax_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, 0
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_max_u32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB21_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB21_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -5106,41 +5195,42 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: umax_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_max_u32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_max_u32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB21_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB21_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -5163,7 +5253,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
@@ -5173,8 +5263,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB22_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
@@ -5196,7 +5286,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
@@ -5206,8 +5296,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5229,7 +5319,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB22_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
@@ -5238,8 +5328,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5261,7 +5351,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
@@ -5272,8 +5362,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -5292,7 +5383,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
@@ -5303,8 +5394,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -5325,7 +5417,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB22_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
@@ -5335,8 +5427,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -5359,7 +5451,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB22_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
@@ -5368,8 +5460,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB22_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
@@ -5396,7 +5488,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -5409,27 +5501,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
; GFX8-NEXT: s_mov_b32 s4, -1
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB23_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_min_u32 s4, s4, s8
-; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB23_4
; GFX8-NEXT: ; %bb.3:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
@@ -5438,8 +5530,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -5450,27 +5542,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_mov_b32 s4, -1
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB23_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_min_u32 s4, s4, s8
-; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB23_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -5478,8 +5570,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -5490,26 +5582,26 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: s_mov_b32 s4, -1
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064-NEXT: s_min_u32 s4, s4, s8
-; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB23_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -5519,8 +5611,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB23_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1
@@ -5531,36 +5624,37 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, -1
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
-; GFX1032-NEXT: s_min_u32 s2, s2, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT: v_writelane_b32 v1, s0, s4
+; GFX1032-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032-NEXT: s_min_u32 s0, s0, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execz .LBB23_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB23_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1
@@ -5571,43 +5665,45 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1164-LABEL: umin_i32_varying:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: s_mov_b32 s4, -1
-; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s8, v1, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
-; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT: v_writelane_b32 v0, s4, s5
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_min_u32 s4, s4, s8
-; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB23_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
-; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB23_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -5617,41 +5713,42 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1132-LABEL: umin_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, -1
+; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s5, v1, s4
; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
-; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
-; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
-; GFX1132-NEXT: s_min_u32 s2, s2, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT: v_writelane_b32 v0, s0, s4
+; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_min_u32 s0, s0, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX1132-NEXT: s_cbranch_execz .LBB23_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-NEXT: v_mov_b32_e32 v2, s2
-; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB23_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1
+; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -5674,7 +5771,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
@@ -5684,8 +5781,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
@@ -5707,7 +5804,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
@@ -5717,8 +5814,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5740,7 +5837,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
@@ -5749,8 +5846,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5772,7 +5869,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
@@ -5783,8 +5880,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5803,7 +5901,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
@@ -5814,8 +5912,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
@@ -5836,7 +5935,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB24_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
@@ -5846,8 +5945,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5870,7 +5969,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB24_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
@@ -5879,8 +5978,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
@@ -5901,3 +6000,5 @@ entry:
store i64 %old, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}