diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll | 461 |
1 files changed, 461 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll new file mode 100644 index 000000000000..afd271c99577 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -0,0 +1,461 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s + +; Simples case, if - then, that requires lane mask merging, +; %phi lane mask will hold %val_A at %A. Lanes that are active in %B +; will overwrite its own lane bit in lane mask with val_B +define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: divergent_i1_phi_if_then: +; GFX10: ; %bb.0: ; %A +; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 +; GFX10-NEXT: ; %bb.2: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +A: + %val_A = icmp uge i32 %tid, 6 + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %B, label %exit + +B: + %val_B = icmp ult i32 %tid, 1 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %sel = select i1 %phi, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; if - else +define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: divergent_i1_phi_if_else: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: 
; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2 +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %sel = select i1 %phi, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; if - break; + +; counter = 0; +; do { +; if (a[counter] == 0) +; break; +; if (b[counter] == 0) +; break; +; if (c[counter] == 0) +; break; +; x[counter++]+=1; +; } while (counter<100); + +; Tests with multiple break conditions. Divergent phis will be used to track +; if any of the break conditions was reached. We only need to do simple lane +; mask merging (for current loop iteration only). There is an intrinsic, +; if_break, that will merge lane masks across all iterations of the loop. 
+ +define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) { +; GFX10-LABEL: loop_with_1break: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_branch .LBB2_2 +; GFX10-NEXT: .LBB2_1: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execz .LBB2_4 +; GFX10-NEXT: .LBB2_2: ; %A +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5] +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: global_load_dword v7, v[7:8], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB2_1 +; GFX10-NEXT: ; %bb.3: ; %loop.body +; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v4 +; GFX10-NEXT: global_load_dword v7, v[5:6], off +; GFX10-NEXT: v_mov_b32_e32 v4, v8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7 +; GFX10-NEXT: global_store_dword v[5:6], v7, off +; GFX10-NEXT: s_branch .LBB2_1 +; GFX10-NEXT: .LBB2_4: ; %exit +; GFX10-NEXT: s_endpgm +entry: + br label %A + +A: + %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ] + %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter + %a.val = load i32, ptr addrspace(1) %a.plus.counter + %a.cond = icmp eq i32 %a.val, 0 + br i1 %a.cond, label %exit, label %loop.body + 
+loop.body: + %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter + %x.val = load i32, ptr addrspace(1) %x.plus.counter + %x.val.plus.1 = add i32 %x.val, 1 + store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter + %counter.plus.1 = add i32 %counter, 1 + %x.cond = icmp ult i32 %counter, 100 + br i1 %x.cond, label %exit, label %A + +exit: + ret void +} + +define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; GFX10-LABEL: loop_with_2breaks: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: s_branch .LBB3_3 +; GFX10-NEXT: .LBB3_1: ; %Flow3 +; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB3_2: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: .LBB3_3: ; %A +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: ; %bb.4: ; %B +; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; 
GFX10-NEXT: s_cbranch_execz .LBB3_1 +; GFX10-NEXT: ; %bb.5: ; %loop.body +; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 +; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6 +; GFX10-NEXT: global_load_dword v9, v[7:8], off +; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_branch .LBB3_1 +; GFX10-NEXT: .LBB3_6: ; %exit +; GFX10-NEXT: s_endpgm +entry: + br label %A + +A: + %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ] + %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter + %a.val = load i32, ptr addrspace(1) %a.plus.counter + %a.cond = icmp eq i32 %a.val, 0 + br i1 %a.cond, label %exit, label %B + +B: + %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter + %b.val = load i32, ptr addrspace(1) %b.plus.counter + %b.cond = icmp eq i32 %b.val, 0 + br i1 %b.cond, label %exit, label %loop.body + +loop.body: + %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter + %x.val = load i32, ptr addrspace(1) %x.plus.counter + %x.val.plus.1 = add i32 %x.val, 1 + store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter + %counter.plus.1 = add i32 %counter, 1 + %x.cond = icmp ult i32 %counter, 100 + br i1 %x.cond, label %exit, label %A + +exit: + ret void +} + +define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) { +; GFX10-LABEL: loop_with_3breaks: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: s_branch .LBB4_4 +; GFX10-NEXT: .LBB4_1: ; %Flow5 +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; 
GFX10-NEXT: .LBB4_2: ; %Flow4 +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB4_3: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execz .LBB4_8 +; GFX10-NEXT: .LBB4_4: ; %A +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9] +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo +; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-NEXT: ; %bb.5: ; %B +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo +; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB4_2 +; GFX10-NEXT: ; %bb.6: ; %C +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo +; GFX10-NEXT: global_load_dword v11, v[11:12], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB4_1 +; GFX10-NEXT: ; %bb.7: ; %loop.body +; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8 +; 
GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v8 +; GFX10-NEXT: global_load_dword v11, v[9:10], off +; GFX10-NEXT: v_mov_b32_e32 v8, v12 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11 +; GFX10-NEXT: global_store_dword v[9:10], v11, off +; GFX10-NEXT: s_branch .LBB4_1 +; GFX10-NEXT: .LBB4_8: ; %exit +; GFX10-NEXT: s_endpgm +entry: + br label %A + +A: + %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ] + %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter + %a.val = load i32, ptr addrspace(1) %a.plus.counter + %a.cond = icmp eq i32 %a.val, 0 + br i1 %a.cond, label %exit, label %B + +B: + %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter + %b.val = load i32, ptr addrspace(1) %b.plus.counter + %b.cond = icmp eq i32 %b.val, 0 + br i1 %b.cond, label %exit, label %C + +C: + %c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter + %c.val = load i32, ptr addrspace(1) %c.plus.counter + %c.cond = icmp eq i32 %c.val, 0 + br i1 %c.cond, label %exit, label %loop.body + +loop.body: + %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter + %x.val = load i32, ptr addrspace(1) %x.plus.counter + %x.val.plus.1 = add i32 %x.val, 1 + store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter + %counter.plus.1 = add i32 %counter, 1 + %x.cond = icmp ult i32 %counter, 100 + br i1 %x.cond, label %exit, label %A + +exit: + ret void +} + +; Divergent condition if with body, ending with break. This is loop with two +; exits but structurizer will create phi that will track exit from break +; and move break.body after the loop. Loop will then have one exit and phi +; used outside of the loop by condition used to enter the break.body. 
+define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { +; GFX10-LABEL: loop_with_div_break_with_body: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: s_branch .LBB5_2 +; GFX10-NEXT: .LBB5_1: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_and_b32 s1, 1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: .LBB5_2: ; %A +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 1 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[9:10], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB5_1 +; GFX10-NEXT: ; %bb.3: ; %loop.body +; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 +; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: global_load_dword v9, v[7:8], off +; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_branch .LBB5_1 +; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s0, s1 +; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX10-NEXT: s_cbranch_execz .LBB5_6 +; GFX10-NEXT: ; %bb.5: ; %break.body +; GFX10-NEXT: v_mov_b32_e32 v0, 10 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: .LBB5_6: ; %exit +; GFX10-NEXT: s_endpgm +entry: + br label %A + +A: + %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ] + %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter + %a.val = load i32, ptr addrspace(1) %a.plus.counter + %a.cond = icmp eq i32 %a.val, 0 + br i1 %a.cond, label %break.body, label %loop.body + +break.body: + store i32 10, ptr addrspace(1) %a.break + br label %exit + + +loop.body: + %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter + %x.val = load i32, ptr addrspace(1) %x.plus.counter + %x.val.plus.1 = add i32 %x.val, 1 + store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter + %counter.plus.1 = add i32 %counter, 1 + %x.cond = icmp ult i32 %counter, 100 + br i1 %x.cond, label %exit, label %A + +exit: + ret void +} + +; Snippet from test generated by the GraphicsFuzz tool, frontend generates ir +; with irreducible control flow graph. FixIrreducible converts it into natural +; loop and in the process creates i1 phi with three incoming values. 
+ +; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) { +; do { +; if (y < a2) { +; do { +; } while (x < a2); +; } +; if (x < a3) { +; return a1; +; } +; } while (y < a2); +; return a0; +; } + +; This test is also interesting because it has phi with three incoming values +;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) { +;.entry: +; %.y_lt_a2 = icmp sgt i32 %a2, %y +; %.x_lt_a2 = icmp sgt i32 %a2, %x +; %.x_lt_a3 = icmp sgt i32 %a3, %x +; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)' +; +;.preheader: ; if (y < a2), +; br label %.inner_loop +; +;.inner_loop: ; do while x < a2 +; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit +; +;.loopexit: ; if x < a3 +; %not.inner_loop = xor i1 %.y_lt_a2, true +; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)' +; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0 ; select return value a1 'x < a3' or a0 'loop ends' +; br i1 %brmerge, label %.exit, label %.preheader +; +;.exit: +; ret i32 %.ret +;} + |
