summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll585
1 files changed, 225 insertions, 360 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f5506f..6167a84094b7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0