diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 585 |
1 files changed, 225 insertions, 360 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f5506f..6167a84094b7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 |
