diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2025-01-13 11:22:55 +0700 |
|---|---|---|
| committer | Matt Arsenault <arsenm2@gmail.com> | 2025-11-11 16:47:47 -0800 |
| commit | afe4cffcca614618efd300a790c17d322c12f4bc (patch) | |
| tree | 806463a80fa08eca1b285ef594e2525f937b7da1 /llvm/test/CodeGen/AMDGPU/mul_int24.ll | |
| parent | 79d9ae7a777a03452991d222642ffdb6687d9210 (diff) | |
DAG: Handle load in SimplifyDemandedVectorElts [users/arsenm/dag/simplify-demanded-vector-elts-load]
This improves some AMDGPU cases and avoids future regressions.
The combiner likes to form shuffles for cases where an extract_vector_elt
would do perfectly well, and this recovers some of the regressions from
losing load narrowing.
AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have
some improvements, but mostly regressions. In particular X86 looks much
worse. I'm guessing this is because its shouldReduceLoadWidth is wrong.
I mostly just regenerated the checks. I assume some set of them should
switch to use volatile loads to defeat the optimization.
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mul_int24.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/mul_int24.ll | 49 |
1 files changed, 25 insertions, 24 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 10d4eb029ee3..36dabd858c70 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -459,18 +459,18 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mul_i32 s5, s4, s5 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -478,11 +478,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s3, s3, 0x180000 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; 
VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 @@ -569,28 +570,28 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s7, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b64 s[6:7], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm |
