DAG: Handle load in SimplifyDemandedVectorElts (branch: users/arsenm/dag/simplify-demanded-vector-elts-load)

This improves some AMDGPU cases and avoids future regressions. The combiner likes to form shuffles for cases where an extract_vector_elt would do perfectly well, and this recovers some of the regressions from losing load narrowing. AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have some improvements, but mostly regressions. In particular X86 looks much worse. I'm guessing this is because its shouldReduceLoadWidth is wrong. I mostly just regenerated the checks. I assume some set of them should switch to use volatile loads to defeat the optimization.
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2025-01-13 11:22:55 +0700
committer: Matt Arsenault <arsenm2@gmail.com> 2025-11-11 16:47:47 -0800
commit: afe4cffcca614618efd300a790c17d322c12f4bc (patch)
tree: 806463a80fa08eca1b285ef594e2525f937b7da1 /llvm/test/CodeGen/AMDGPU/mul_int24.ll
parent: 79d9ae7a777a03452991d222642ffdb6687d9210 (diff)
1 files changed, 25 insertions, 24 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 10d4eb029ee3..36dabd858c70 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -459,18 +459,18 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
 define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
 ; SI-LABEL: test_smul24_i33:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_i32 s5, s6, 0x180000
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_mul_i32 s4, s5, s4
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    s_mul_i32 s5, s4, s5
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
 ; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -478,11 +478,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
 ;
 ; VI-LABEL: test_smul24_i33:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x180000
-; VI-NEXT:    s_bfe_i32 s3, s4, 0x180000
+; VI-NEXT:    s_bfe_i32 s3, s3, 0x180000
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
 ; VI-NEXT:    v_mul_i32_i24_e32 v0, s2, v0
@@ -569,28 +570,28 @@ entry:
 define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_smulhi24_i33:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
+; SI-NEXT:    s_load_dword s7, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s6, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_smulhi24_i33:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s6, v0
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
+; VI-NEXT:    s_load_dword s7, s[4:5], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2025-01-13 11:22:55 +0700
committer	Matt Arsenault <arsenm2@gmail.com>	2025-11-11 16:47:47 -0800
commit	afe4cffcca614618efd300a790c17d322c12f4bc (patch)
tree	806463a80fa08eca1b285ef594e2525f937b7da1 /llvm/test/CodeGen/AMDGPU/mul_int24.ll
parent	79d9ae7a777a03452991d222642ffdb6687d9210 (diff)