diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2024-07-15 09:59:07 +0400 |
|---|---|---|
| committer | Matt Arsenault <arsenm2@gmail.com> | 2024-07-15 11:51:44 +0400 |
| commit | b1bcb7ca460fcd317bbc8309e14c8761bf8394e0 (patch) | |
| tree | cf2636217534435b2de9783a7cf8e9325819e658 /llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | |
| parent | 71051deff27928cff908ea794e09806eee662801 (diff) | |
Reapply "AMDGPU: Move attributor into optimization pipeline (#83131)" and follow up commit "clang/AMDGPU: Defeat attribute optimization in attribute test" (#98851)
This reverts commit adaff46d087799072438dd744b038e6fd50a2d78.
Drop the -O3 checks from default-attributes.hip. I don't know why they
are different on some bots but reverting this is far too disruptive.
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 2219 |
1 files changed, 1160 insertions, 1059 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b0b40aa952a9..3784af443c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -35,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -52,7 +52,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -90,8 +90,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -107,7 +107,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -119,8 +119,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -131,24 +132,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -160,7 +162,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -177,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -192,24 +194,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -232,12 +234,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -249,8 +251,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -262,13 +264,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -280,8 +282,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -293,13 +295,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -310,8 +312,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -323,13 +325,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -342,8 +344,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -354,39 +357,40 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: s_mul_i32 s4, s0, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -404,8 +408,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -419,9 +423,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -430,22 +434,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s2, s4 +; GFX1132-NEXT: s_mul_i32 s4, s0, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] +; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -460,7 +464,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -473,27 +477,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -502,8 +506,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -514,27 +518,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -542,8 +546,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -554,26 +558,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -583,8 +587,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -595,36 +600,37 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_add_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_add_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -635,43 +641,45 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -681,41 +689,42 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_add_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_add_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -854,16 +863,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 @@ -886,16 +896,17 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 @@ -929,7 +940,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -940,8 +951,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -962,7 +973,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -973,10 +984,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -994,7 +1005,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1004,10 +1015,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1025,7 +1036,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1037,8 +1048,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1050,24 +1062,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1080,7 +1093,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1097,8 +1110,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1113,25 +1126,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1155,7 +1168,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1196,7 +1209,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1234,7 +1247,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1272,7 +1285,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1308,7 +1321,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1343,7 +1356,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1384,7 +1397,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1432,7 +1445,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1447,7 +1460,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1459,7 +1472,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: add_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1471,7 +1484,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1481,20 +1494,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1164-LABEL: add_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1513,7 +1542,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1524,8 +1553,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1542,7 +1571,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1553,8 +1582,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -1571,7 +1600,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1581,8 +1610,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -1599,7 +1628,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1611,8 +1640,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1624,24 +1654,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -1654,7 +1685,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1671,8 +1702,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1687,24 +1718,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1728,12 +1759,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1745,8 +1776,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1758,13 +1789,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1776,8 +1807,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -1789,13 +1820,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1806,8 +1837,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1819,13 +1850,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1838,8 +1869,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1851,40 +1882,40 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: s_mul_i32 s4, s0, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1902,8 +1933,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -1918,9 +1949,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1929,23 +1960,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s2, s4 +; GFX1132-NEXT: s_mul_i32 s4, s0, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -1960,7 +1991,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,27 +2004,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB9_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB9_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -2002,8 +2033,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2014,27 +2045,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2042,8 +2073,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -2054,26 +2085,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2083,8 +2114,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2095,36 +2127,37 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_add_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_add_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2135,43 +2168,45 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB9_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2181,41 +2216,42 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_add_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_add_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB9_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2354,16 +2390,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2386,16 +2423,17 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2429,7 +2467,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2440,8 +2478,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -2462,7 +2500,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2473,8 +2511,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2495,7 +2533,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2505,8 +2543,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2527,7 +2565,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2539,8 +2577,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -2555,24 +2594,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -2588,7 +2628,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2605,8 +2645,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -2624,25 +2664,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -2669,7 +2709,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2710,7 +2750,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2749,7 +2789,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2789,7 +2829,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2828,7 +2868,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2866,7 +2906,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2909,7 +2949,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2959,7 +2999,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2974,7 +3014,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2986,7 +3026,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: sub_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2998,7 +3038,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3008,20 +3048,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sub_i64_varying: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX1164-LABEL: sub_i64_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i64_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3035,7 +3091,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: and_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,27 +3104,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB14_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3077,8 +3133,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB14_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3089,27 +3145,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3117,8 +3173,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB14_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3129,26 +3185,26 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_and_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3158,8 +3214,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3170,36 +3227,37 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_and_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_and_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3210,43 +3268,45 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_and_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB14_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3256,41 +3316,42 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_and_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_and_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB14_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3309,7 +3370,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: or_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3322,27 +3383,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB15_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3351,8 +3412,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3363,27 +3424,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3391,8 +3452,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3403,26 +3464,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_or_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3432,8 +3493,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3444,36 +3506,37 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_or_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_or_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB15_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3484,43 +3547,45 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_or_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3530,41 +3595,42 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_or_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_or_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3583,7 +3649,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: xor_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3596,27 +3662,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_xor_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB16_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3625,8 +3691,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3637,27 +3703,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3665,8 +3731,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3677,26 +3743,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3706,8 +3772,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3718,36 +3785,37 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_xor_b32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_xor_b32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3758,43 +3826,45 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3804,41 +3874,42 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_xor_b32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_xor_b32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3857,7 +3928,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: max_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,27 +3941,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_brev_b32 s4, 1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB17_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3899,8 +3970,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB17_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3911,27 +3982,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_brev_b32 s4, 1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3939,8 +4010,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3951,26 +4022,26 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_brev_b32 s4, 1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_max_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3980,8 +4051,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 @@ -3992,36 +4064,37 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_brev_b32 s2, 1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_brev_b32 s0, 1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_max_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_max_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB17_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 @@ -4032,43 +4105,45 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB17_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4078,41 +4153,42 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_brev_b32 s2, 1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_brev_b32 s0, 1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_max_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_max_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB17_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB17_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4135,7 +4211,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4145,8 +4221,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB18_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4169,7 +4245,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4179,10 +4255,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB18_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4203,7 +4279,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4212,10 +4288,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB18_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4236,7 +4312,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4247,8 +4323,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4267,7 +4344,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4278,8 +4355,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4300,7 +4378,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4310,8 +4388,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4334,7 +4412,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4343,8 +4421,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4371,7 +4449,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: min_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4384,27 +4462,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB19_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -4413,8 +4491,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB19_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4425,27 +4503,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB19_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4453,8 +4531,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB19_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4465,26 +4543,26 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_brev_b32 s4, -2 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_min_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB19_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4494,8 +4572,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4506,36 +4585,37 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_brev_b32 s2, -2 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_brev_b32 s0, -2 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_min_i32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_min_i32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB19_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4546,43 +4626,45 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB19_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4592,41 +4674,42 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_brev_b32 s2, -2 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_brev_b32 s0, -2 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_min_i32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_min_i32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB19_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB19_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4649,7 +4732,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4659,8 +4742,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB20_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4683,7 +4766,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4693,10 +4776,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB20_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4717,7 +4800,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4726,10 +4809,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB20_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4750,7 +4833,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4761,8 +4844,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4781,7 +4865,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4792,8 +4876,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4814,7 +4899,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4824,8 +4909,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB20_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4848,7 +4933,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4857,8 +4942,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB20_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4885,7 +4970,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umax_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,27 +4983,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB21_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -4927,8 +5012,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4939,27 +5024,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB21_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4967,8 +5052,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4979,26 +5064,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_max_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB21_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5008,8 +5093,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5020,36 +5106,37 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_max_u32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_max_u32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB21_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5060,43 +5147,45 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5106,41 +5195,42 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_max_u32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_max_u32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB21_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5163,7 +5253,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5173,8 +5263,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5196,7 +5286,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5206,8 +5296,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5229,7 +5319,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5238,8 +5328,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5261,7 +5351,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5272,8 +5362,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5292,7 +5383,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5303,8 +5394,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5325,7 +5417,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5335,8 +5427,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5359,7 +5451,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5368,8 +5460,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -5396,7 +5488,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umin_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5409,27 +5501,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB23_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -5438,8 +5530,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB23_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -5450,27 +5542,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB23_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -5478,8 +5570,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -5490,26 +5582,26 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064-NEXT: s_min_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5519,8 +5611,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5531,36 +5624,37 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 -; GFX1032-NEXT: s_min_u32 s2, s2, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032-NEXT: s_min_u32 s0, s0, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s0 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5571,43 +5665,45 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB23_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5617,41 +5713,42 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 -; GFX1132-NEXT: s_min_u32 s2, s2, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_min_u32 s0, s0, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB23_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5674,7 +5771,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5684,8 +5781,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB24_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5707,7 +5804,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5717,8 +5814,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB24_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5740,7 +5837,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5749,8 +5846,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB24_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5772,7 +5869,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5783,8 +5880,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5803,7 +5901,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5814,8 +5912,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -5836,7 +5935,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB24_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5846,8 +5945,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB24_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5870,7 +5969,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5879,8 +5978,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB24_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -5901,3 +6000,5 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} |
